files into a sensible directory hierarchy.
Signed-off-by: Keir Fraser <keir@xensource.com>
goto error_out;
}
- /* HVM domains must be put into shadow2 mode at the start of day */
+ /* HVM domains must be put into shadow mode at the start of day */
if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_ENABLE,
NULL, 0, NULL,
XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT |
subdir-y += cpu
subdir-y += genapic
subdir-y += hvm
+subdir-y += mm
subdir-y += oprofile
subdir-$(x86_32) += x86_32
obj-y += usercopy.o
obj-y += x86_emulate.o
-ifneq ($(pae),n)
-obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o
-else
-obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o
-endif
-
-obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \
- shadow2_g2_on_s3.o
-
-guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
-shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
-shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
- -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
-
-shadow2_%.o: shadow2.c $(HDRS) Makefile
- $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@
-
obj-$(crash_debug) += gdbstub.o
$(TARGET): $(TARGET)-syms boot/mkelf32
boot/mkelf32: boot/mkelf32.c
$(HOSTCC) $(HOSTCFLAGS) -o $@ $<
-shadow_guest32.o: shadow.c
-shadow_guest32pae.o: shadow.c
-
.PHONY: clean
clean::
rm -f asm-offsets.s xen.lds boot/*.o boot/*~ boot/core boot/mkelf32
#endif /* __x86_64__ */
- shadow2_lock_init(d);
- for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ )
- INIT_LIST_HEAD(&d->arch.shadow2.freelists[i]);
- INIT_LIST_HEAD(&d->arch.shadow2.p2m_freelist);
- INIT_LIST_HEAD(&d->arch.shadow2.p2m_inuse);
- INIT_LIST_HEAD(&d->arch.shadow2.toplevel_shadows);
+ shadow_lock_init(d);
+ for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
+ INIT_LIST_HEAD(&d->arch.shadow.freelists[i]);
+ INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist);
+ INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse);
+ INIT_LIST_HEAD(&d->arch.shadow.toplevel_shadows);
if ( !is_idle_domain(d) )
{
void arch_domain_destroy(struct domain *d)
{
- shadow2_final_teardown(d);
+ shadow_final_teardown(d);
free_xenheap_pages(
d->arch.mm_perdomain_pt,
}
}
- /* Shadow2: make sure the domain has enough shadow memory to
+ /* Shadow: make sure the domain has enough shadow memory to
* boot another vcpu */
- if ( shadow2_mode_enabled(d)
- && d->arch.shadow2.total_pages < shadow2_min_acceptable_pages(d) )
+ if ( shadow_mode_enabled(d)
+ && d->arch.shadow.total_pages < shadow_min_acceptable_pages(d) )
{
destroy_gdt(v);
return -ENOMEM;
/* Don't redo final setup */
set_bit(_VCPUF_initialised, &v->vcpu_flags);
- if ( shadow2_mode_enabled(d) )
- shadow2_update_paging_modes(v);
+ if ( shadow_mode_enabled(d) )
+ shadow_update_paging_modes(v);
update_cr3(v);
for_each_vcpu ( d, v )
{
/* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
- * or sh2_update_paging_modes()) */
+ * or sh_update_paging_modes()) */
pfn = pagetable_get_pfn(v->arch.guest_table);
if ( pfn != 0 )
{
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
put_page(mfn_to_page(pfn));
else
put_page_and_type(mfn_to_page(pfn));
hvm_relinquish_guest_resources(d);
/* Tear down shadow mode stuff. */
- shadow2_teardown(d);
+ shadow_teardown(d);
/*
* Relinquish GDT mappings. No need for explicit unmapping of the LDT as
void arch_dump_domain_info(struct domain *d)
{
- if ( shadow2_mode_enabled(d) )
+ if ( shadow_mode_enabled(d) )
{
- printk(" shadow2 mode: ");
- if ( d->arch.shadow2.mode & SHM2_enable )
+ printk(" shadow mode: ");
+ if ( d->arch.shadow.mode & SHM2_enable )
printk("enabled ");
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
printk("refcounts ");
- if ( shadow2_mode_log_dirty(d) )
+ if ( shadow_mode_log_dirty(d) )
printk("log_dirty ");
- if ( shadow2_mode_translate(d) )
+ if ( shadow_mode_translate(d) )
printk("translate ");
- if ( shadow2_mode_external(d) )
+ if ( shadow_mode_external(d) )
printk("external ");
printk("\n");
}
(void)alloc_vcpu(d, i, i);
/* Set up CR3 value for write_ptbase */
- if ( shadow2_mode_enabled(v->domain) )
- shadow2_update_paging_modes(v);
+ if ( shadow_mode_enabled(v->domain) )
+ shadow_update_paging_modes(v);
else
update_cr3(v);
new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
if ( opt_dom0_shadow )
- if ( shadow2_test_enable(d) == 0 )
- shadow2_update_paging_modes(v);
+ if ( shadow_test_enable(d) == 0 )
+ shadow_update_paging_modes(v);
if ( supervisor_mode_kernel )
{
d = find_domain_by_id(domctl->domain);
if ( d != NULL )
{
- ret = shadow2_domctl(d, &domctl->u.shadow_op, u_domctl);
+ ret = shadow_domctl(d, &domctl->u.shadow_op, u_domctl);
put_domain(d);
copy_to_guest(u_domctl, domctl, 1);
}
if (count > size)
count = size;
- gfn = shadow2_gva_to_gfn(v, vaddr);
- mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn));
+ gfn = shadow_gva_to_gfn(v, vaddr);
+ mfn = mfn_x(sh_vcpu_gfn_to_mfn(v, gfn));
if (mfn == INVALID_MFN)
return 0;
return;
}
- if ( current->arch.shadow2.mode->guest_levels == 4 )
+ if ( current->arch.shadow.mode->guest_levels == 4 )
{
pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi,
pregs->rsi,
if (pvalid) {
if (hvm_paging_enabled(current))
- p->u.data = shadow2_gva_to_gpa(current, value);
+ p->u.data = shadow_gva_to_gpa(current, value);
else
p->u.pdata = (void *) value; /* guest VA == guest PA */
} else
if (pvalid) {
if (hvm_paging_enabled(v))
- p->u.data = shadow2_gva_to_gpa(v, value);
+ p->u.data = shadow_gva_to_gpa(v, value);
else
p->u.pdata = (void *) value; /* guest VA == guest PA */
} else
#include <xen/domain_page.h>
#include <asm/current.h>
#include <asm/io.h>
-#include <asm/shadow2.h>
+#include <asm/shadow.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
if ( v != d->vcpu[0] )
return;
- if ( !shadow2_mode_external(d) )
+ if ( !shadow_mode_external(d) )
{
DPRINTK("Can't init HVM for dom %u vcpu %u: "
- "not in shadow2 external mode\n", d->domain_id, v->vcpu_id);
+ "not in shadow external mode\n", d->domain_id, v->vcpu_id);
domain_crash(d);
}
va, eip, (unsigned long)regs->error_code);
//#endif
- result = shadow2_fault(va, regs);
+ result = shadow_fault(va, regs);
if( result ) {
/* Let's make sure that the Guest TLB is flushed */
v->arch.guest_table = pagetable_from_pfn(mfn);
if ( old_base_mfn )
put_page(mfn_to_page(old_base_mfn));
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
svm_inject_exception(v, TRAP_gp_fault, 1, 0);
return 0;
}
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
}
else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
{
/* we should take care of this kind of situation */
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
}
mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
if (mfn != pagetable_get_pfn(v->arch.guest_table))
__hvm_bug(regs);
- shadow2_update_cr3(v);
+ shadow_update_cr3(v);
}
else
{
v->arch.guest_table = pagetable_from_pfn(mfn);
if ( old_base_mfn )
put_page(mfn_to_page(old_base_mfn));
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
{
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
}
break;
}
/* Overkill, we may not this */
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- shadow2_invlpg(v, g_vaddr);
+ shadow_invlpg(v, g_vaddr);
}
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
unsigned long gpa;
- gpa = shadow2_gva_to_gpa(current, gva);
+ gpa = shadow_gva_to_gpa(current, gva);
printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 );
if( !svm_paging_enabled(v) || mmio_space(gpa) )
return;
if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF)
{
if (svm_paging_enabled(v) &&
- !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2)))
+ !mmio_space(shadow_gva_to_gpa(current, vmcb->exitinfo2)))
{
printk("I%08ld,ExC=%s(%d),IP=%x:%llx,"
"I1=%llx,I2=%llx,INT=%llx, "
(unsigned long long) vmcb->exitinfo1,
(unsigned long long) vmcb->exitinfo2,
(unsigned long long) vmcb->exitintinfo.bytes,
- (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2));
+ (unsigned long long) shadow_gva_to_gpa(current, vmcb->exitinfo2));
}
else
{
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/keyhandler.h>
-#include <asm/shadow2.h>
+#include <asm/shadow.h>
static int vmcs_size;
static int vmcs_order;
error |= __vmwrite(GUEST_TR_BASE, 0);
error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n",
__func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3);
__vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/hvm/vmx/cpu.h>
-#include <asm/shadow2.h>
+#include <asm/shadow.h>
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <asm/hvm/vpic.h>
if ( v->vcpu_id != 0 )
return 1;
- if ( !shadow2_mode_external(d) )
+ if ( !shadow_mode_external(d) )
{
DPRINTK("Can't init HVM for dom %u vcpu %u: "
- "not in shadow2 external mode\n",
+ "not in shadow external mode\n",
d->domain_id, v->vcpu_id);
domain_crash(d);
}
}
#endif
- result = shadow2_fault(va, regs);
+ result = shadow_fault(va, regs);
TRACE_VMEXIT (2,result);
#if 0
* We do the safest things first, then try to update the shadow
* copying from guest
*/
- shadow2_invlpg(v, va);
+ shadow_invlpg(v, va);
}
skip_cr3:
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
if (!vmx_paging_enabled(v))
HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
else
v->arch.guest_table = pagetable_from_pfn(mfn);
if (old_base_mfn)
put_page(mfn_to_page(old_base_mfn));
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
{
__vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
}
return 1;
mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
if (mfn != pagetable_get_pfn(v->arch.guest_table))
__hvm_bug(regs);
- shadow2_update_cr3(v);
+ shadow_update_cr3(v);
} else {
/*
* If different, make a shadow. Check if the PDBR is valid
* all TLB entries except global entries.
*/
if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
break;
}
default:
res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
- if ( !res && unlikely(shadow2_mode_refcounts(d)) )
+ if ( !res && unlikely(shadow_mode_refcounts(d)) )
{
- shadow2_lock(d);
- shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
+ shadow_lock(d);
+ shadow_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
- shadow2_unlock(d);
+ shadow_unlock(d);
}
if ( unlikely(!res) )
struct page_info *page;
unsigned long pfn;
- ASSERT( !shadow2_mode_refcounts(d) );
+ ASSERT( !shadow_mode_refcounts(d) );
if ( (root_get_flags(re) & _PAGE_RW) )
{
d = dom_io;
}
- /* Foreign mappings into guests in shadow2 external mode don't
+ /* Foreign mappings into guests in shadow external mode don't
* contribute to writeable mapping refcounts. (This allows the
* qemu-dm helper process in dom0 to map the domain's memory without
* messing up the count of "real" writable mappings.) */
okay = (((l1e_get_flags(l1e) & _PAGE_RW) &&
- !(unlikely(shadow2_mode_external(d) && (d != current->domain))))
+ !(unlikely(shadow_mode_external(d) && (d != current->domain))))
? get_page_and_type(page, d, PGT_writable_page)
: get_page(page, d));
if ( !okay )
}
/* Remember we didn't take a type-count of foreign writable mappings
- * to shadow2 external domains */
+ * to shadow external domains */
if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
- !(unlikely((e != d) && shadow2_mode_external(e))) )
+ !(unlikely((e != d) && shadow_mode_external(e))) )
{
put_page_and_type(page);
}
l1_pgentry_t *pl1e;
int i;
- ASSERT(!shadow2_mode_refcounts(d));
+ ASSERT(!shadow_mode_refcounts(d));
pl1e = map_domain_page(pfn);
* a. alloc_l3_table() calls this function and this check will fail
* b. mod_l3_entry() disallows updates to slot 3 in an existing table
*
- * XXX -- this needs revisiting for shadow2_mode_refcount()==true...
+ * XXX -- this needs revisiting for shadow_mode_refcount()==true...
*/
page = l3e_get_page(l3e3);
BUG_ON(page->u.inuse.type_info & PGT_pinned);
l2_pgentry_t *pl2e;
int i;
- ASSERT(!shadow2_mode_refcounts(d));
+ ASSERT(!shadow_mode_refcounts(d));
pl2e = map_domain_page(pfn);
l3_pgentry_t *pl3e;
int i;
- ASSERT(!shadow2_mode_refcounts(d));
+ ASSERT(!shadow_mode_refcounts(d));
#ifdef CONFIG_X86_PAE
/*
unsigned long vaddr;
int i;
- ASSERT(!shadow2_mode_refcounts(d));
+ ASSERT(!shadow_mode_refcounts(d));
for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
{
struct vcpu *v)
{
int rv = 1;
- if ( unlikely(shadow2_mode_enabled(v->domain)) )
- shadow2_lock(v->domain);
+ if ( unlikely(shadow_mode_enabled(v->domain)) )
+ shadow_lock(v->domain);
#ifndef PTE_UPDATE_WITH_CMPXCHG
rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
#else
}
}
#endif
- if ( unlikely(shadow2_mode_enabled(v->domain)) )
+ if ( unlikely(shadow_mode_enabled(v->domain)) )
{
- shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
- shadow2_unlock(v->domain);
+ shadow_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
+ shadow_unlock(v->domain);
}
return rv;
}
#endif
#define UPDATE_ENTRY(_t,_p,_o,_n,_m) ({ \
int rv; \
- if ( unlikely(shadow2_mode_enabled(current->domain)) ) \
- shadow2_lock(current->domain); \
+ if ( unlikely(shadow_mode_enabled(current->domain)) ) \
+ shadow_lock(current->domain); \
rv = _UPDATE_ENTRY(_t, _p, _o, _n); \
- if ( unlikely(shadow2_mode_enabled(current->domain)) ) \
+ if ( unlikely(shadow_mode_enabled(current->domain)) ) \
{ \
- shadow2_validate_guest_entry(current, _mfn(_m), (_p)); \
- shadow2_unlock(current->domain); \
+ shadow_validate_guest_entry(current, _mfn(_m), (_p)); \
+ shadow_unlock(current->domain); \
} \
rv; \
})
*/
this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
- if ( unlikely(shadow2_mode_enabled(owner)
- && !shadow2_lock_is_acquired(owner)) )
+ if ( unlikely(shadow_mode_enabled(owner)
+ && !shadow_lock_is_acquired(owner)) )
{
/* Raw page tables are rewritten during save/restore. */
- if ( !shadow2_mode_translate(owner) )
+ if ( !shadow_mode_translate(owner) )
mark_dirty(owner, page_to_mfn(page));
- if ( shadow2_mode_refcounts(owner) )
+ if ( shadow_mode_refcounts(owner) )
return;
gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
ASSERT(VALID_M2P(gmfn));
- shadow2_lock(owner);
- shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
- shadow2_unlock(owner);
+ shadow_lock(owner);
+ shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
+ shadow_unlock(owner);
}
}
#endif
/* Fixme: add code to propagate va_unknown to subtables. */
if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
- !shadow2_mode_refcounts(page_get_owner(page)) )
+ !shadow_mode_refcounts(page_get_owner(page)) )
return 0;
/* This table is possibly mapped at multiple locations. */
nx &= ~PGT_va_mask;
if ( hvm_guest(v) && !hvm_paging_enabled(v) )
domain_crash_synchronous();
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
{
okay = get_page_from_pagenr(mfn, d);
if ( unlikely(!okay) )
if ( likely(old_base_mfn != 0) )
{
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
put_page(mfn_to_page(old_base_mfn));
else
put_page_and_type(mfn_to_page(old_base_mfn));
type = PGT_root_page_table;
pin_page:
- if ( shadow2_mode_refcounts(FOREIGNDOM) )
+ if ( shadow_mode_refcounts(FOREIGNDOM) )
break;
okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
break;
case MMUEXT_UNPIN_TABLE:
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
break;
if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
{
put_page_and_type(page);
put_page(page);
- if ( shadow2_mode_enabled(d) )
+ if ( shadow_mode_enabled(d) )
{
- shadow2_lock(d);
- shadow2_remove_all_shadows(v, _mfn(mfn));
- shadow2_unlock(d);
+ shadow_lock(d);
+ shadow_remove_all_shadows(v, _mfn(mfn));
+ shadow_unlock(d);
}
}
else
break;
case MMUEXT_INVLPG_LOCAL:
- if ( !shadow2_mode_enabled(d)
- || shadow2_invlpg(v, op.arg1.linear_addr) != 0 )
+ if ( !shadow_mode_enabled(d)
+ || shadow_invlpg(v, op.arg1.linear_addr) != 0 )
local_flush_tlb_one(op.arg1.linear_addr);
break;
unsigned long ptr = op.arg1.linear_addr;
unsigned long ents = op.arg2.nr_ents;
- if ( shadow2_mode_external(d) )
+ if ( shadow_mode_external(d) )
{
MEM_LOG("ignoring SET_LDT hypercall from external "
"domain %u", d->domain_id);
case PGT_l3_page_table:
case PGT_l4_page_table:
{
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
{
DPRINTK("mmu update on shadow-refcounted domain!");
break;
if ( unlikely(!get_page_type(page, PGT_writable_page)) )
break;
- if ( unlikely(shadow2_mode_enabled(d)) )
- shadow2_lock(d);
+ if ( unlikely(shadow_mode_enabled(d)) )
+ shadow_lock(d);
*(intpte_t *)va = req.val;
okay = 1;
- if ( unlikely(shadow2_mode_enabled(d)) )
+ if ( unlikely(shadow_mode_enabled(d)) )
{
- shadow2_validate_guest_entry(v, _mfn(mfn), va);
- shadow2_unlock(d);
+ shadow_validate_guest_entry(v, _mfn(mfn), va);
+ shadow_unlock(d);
}
put_page_type(page);
break;
}
- if ( shadow2_mode_translate(FOREIGNDOM) )
- shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
+ if ( shadow_mode_translate(FOREIGNDOM) )
+ shadow_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
else
set_gpfn_from_mfn(mfn, gpfn);
okay = 1;
goto failed;
}
- if ( !shadow2_mode_refcounts(d) )
+ if ( !shadow_mode_refcounts(d) )
put_page_from_l1e(ol1e, d);
put_page_type(page);
l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
return GNTST_general_error;
- if ( !shadow2_mode_refcounts(d) )
+ if ( !shadow_mode_refcounts(d) )
put_page_from_l1e(ol1e, d);
return GNTST_okay;
perfc_incrc(calls_to_update_va);
- if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) )
+ if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
return -EINVAL;
- if ( unlikely(shadow2_mode_refcounts(d)) )
+ if ( unlikely(shadow_mode_refcounts(d)) )
{
DPRINTK("Grant op on a shadow-refcounted domain\n");
return -EINVAL;
LOCK_BIGLOCK(d);
- if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) )
+ if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
{
if ( unlikely(this_cpu(percpu_mm_info).foreign &&
- (shadow2_mode_translate(d) ||
- shadow2_mode_translate(
+ (shadow_mode_translate(d) ||
+ shadow_mode_translate(
this_cpu(percpu_mm_info).foreign))) )
{
/*
switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
{
case UVMF_LOCAL:
- if ( !shadow2_mode_enabled(d)
- || (shadow2_invlpg(current, va) != 0) )
+ if ( !shadow_mode_enabled(d)
+ || (shadow_invlpg(current, va) != 0) )
local_flush_tlb_one(va);
break;
case UVMF_ALL:
break;
}
- if ( !shadow2_mode_translate(d) || (mfn == 0) )
+ if ( !shadow_mode_translate(d) || (mfn == 0) )
{
put_domain(d);
return -EINVAL;
pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
if ( do_cmpxchg )
{
- if ( shadow2_mode_enabled(d) )
- shadow2_lock(d);
+ if ( shadow_mode_enabled(d) )
+ shadow_lock(d);
ol1e = l1e_from_intpte(old);
if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
{
- if ( shadow2_mode_enabled(d) )
- shadow2_unlock(d);
+ if ( shadow_mode_enabled(d) )
+ shadow_unlock(d);
unmap_domain_page(pl1e);
put_page_from_l1e(nl1e, d);
return X86EMUL_CMPXCHG_FAILED;
}
- if ( unlikely(shadow2_mode_enabled(v->domain)) )
+ if ( unlikely(shadow_mode_enabled(v->domain)) )
{
- shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
- shadow2_unlock(v->domain);
+ shadow_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
+ shadow_unlock(v->domain);
}
}
else
--- /dev/null
+subdir-y += shadow
--- /dev/null
+# Build rules for the shadow pagetable code.  common.o is compiled once;
+# multi.c is compiled once per supported (guest-levels, shadow-levels)
+# pair, with the pair encoded in the object name as g<G>_on_s<S>.o.
+ifneq ($(pae),n)
+obj-$(x86_32) += common.o g2_on_s3.o g3_on_s3.o
+else
+obj-$(x86_32) += common.o g2_on_s2.o
+endif
+
+obj-$(x86_64) += common.o g4_on_s4.o g3_on_s3.o g2_on_s3.o
+
+# Extract the numeric paging levels from an object name like "g2_on_s3.o".
+guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(1)))))
+shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(1)))))
+shadow_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
+ -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
+
+# Pattern rule: every g%.o target is multi.c built with the level
+# defines derived from its own filename.
+g%.o: multi.c $(HDRS) Makefile
+	$(CC) $(CFLAGS) $(call shadow_defns,$(@F)) -c $< -o $@
--- /dev/null
+/******************************************************************************
+ * arch/x86/mm/shadow/common.c
+ *
+ * Shadow code that does not need to be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define SHADOW 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/irq.h>
+#include <xen/domain_page.h>
+#include <xen/guest_access.h>
+#include <xen/keyhandler.h>
+#include <asm/event.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/flushtlb.h>
+#include <asm/shadow.h>
+#include "private.h"
+
+#if SHADOW_AUDIT
+int shadow_audit_enable = 0;
+
+/* Debug keyhandler: toggle the global shadow-audit flag and report the
+ * new state on the console. */
+static void shadow_audit_key(unsigned char key)
+{
+ shadow_audit_enable = !shadow_audit_enable;
+ printk("%s shadow_audit_enable=%d\n",
+ __func__, shadow_audit_enable);
+}
+
+/* Register the 'O' console key at boot so shadow audits can be toggled
+ * at run time. */
+static int __init shadow_audit_key_init(void)
+{
+ register_keyhandler(
+ 'O', shadow_audit_key, "toggle shadow audits");
+ return 0;
+}
+__initcall(shadow_audit_key_init);
+#endif /* SHADOW_AUDIT */
+
+static void sh_free_log_dirty_bitmap(struct domain *d);
+
+/* Out-of-line wrapper for the shadow_mode_refcounts() predicate, for
+ * callers that cannot use the inline/macro form directly. */
+int _shadow_mode_refcounts(struct domain *d)
+{
+ return shadow_mode_refcounts(d);
+}
+
+
+/**************************************************************************/
+/* x86 emulator support for the shadow code
+ */
+
+/* Emulator callback: read 'bytes' bytes of guest memory at virtual
+ * address 'addr' into *val.  Only implemented for HVM vcpus (via
+ * hvm_copy()); other callers get X86EMUL_UNHANDLEABLE. */
+static int
+sh_x86_emulate_read_std(unsigned long addr,
+ unsigned long *val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+ if ( hvm_guest(v) )
+ {
+ *val = 0;
+ // XXX -- this is WRONG.
+ // It entirely ignores the permissions in the page tables.
+ // In this case, that is only a user vs supervisor access check.
+ //
+ if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
+ {
+#if 0
+ SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id,
+ addr, *val, bytes);
+#endif
+ return X86EMUL_CONTINUE;
+ }
+
+ /* If we got here, there was nothing mapped here, or a bad GFN
+ * was mapped here. This should never happen: we're here because
+ * of a write fault at the end of the instruction we're emulating. */
+ SHADOW_PRINTK("read failed to va %#lx\n", addr);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ else
+ {
+ SHADOW_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+/* Emulator callback: write 'bytes' bytes of 'val' to guest virtual
+ * address 'addr'.  Only implemented for HVM vcpus (via hvm_copy());
+ * other callers get X86EMUL_UNHANDLEABLE. */
+static int
+sh_x86_emulate_write_std(unsigned long addr,
+ unsigned long val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ // XXX -- this is WRONG.
+ // It entirely ignores the permissions in the page tables.
+ // In this case, that includes user vs supervisor, and
+ // write access.
+ //
+ if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
+ return X86EMUL_CONTINUE;
+
+ /* If we got here, there was nothing mapped here, or a bad GFN
+ * was mapped here. This should never happen: we're here because
+ * of a write fault at the end of the instruction we're emulating,
+ * which should be handled by sh_x86_emulate_write_emulated. */
+ SHADOW_PRINTK("write failed to va %#lx\n", addr);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ else
+ {
+ SHADOW_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+/* Emulator callback for writes that must go through the shadow code
+ * (e.g. writes to shadowed pagetables): dispatch to the per-paging-mode
+ * x86_emulate_write handler.  HVM only. */
+static int
+sh_x86_emulate_write_emulated(unsigned long addr,
+ unsigned long val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
+ }
+ else
+ {
+ SHADOW_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+/* Emulator callback for CMPXCHG on shadowed memory: dispatch to the
+ * per-paging-mode x86_emulate_cmpxchg handler.  HVM only. */
+static int
+sh_x86_emulate_cmpxchg_emulated(unsigned long addr,
+ unsigned long old,
+ unsigned long new,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new,
+ bytes, ctxt);
+ }
+ else
+ {
+ SHADOW_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+/* Emulator callback for CMPXCHG8B on shadowed memory: dispatch to the
+ * per-paging-mode x86_emulate_cmpxchg8b handler.  HVM only. */
+static int
+sh_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
+ unsigned long old_lo,
+ unsigned long old_hi,
+ unsigned long new_lo,
+ unsigned long new_hi,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+ /* NOTE(review): the disabled SHADOW_PRINTK below passes 'ctxt' as an
+ * extra vararg with no matching format specifier -- harmless while
+ * under #if 0, but should be dropped if the trace is re-enabled. */
+#if 0
+ SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
+ v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
+ new_hi, new_lo, ctxt);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
+ new_lo, new_hi, ctxt);
+ }
+ else
+ {
+ SHADOW_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+
+/* Callback table handed to the x86 emulator when the shadow code needs
+ * to emulate a guest instruction.  Note read_emulated aliases read_std:
+ * emulated reads need no special shadow handling. */
+struct x86_emulate_ops shadow_emulator_ops = {
+ .read_std = sh_x86_emulate_read_std,
+ .write_std = sh_x86_emulate_write_std,
+ .read_emulated = sh_x86_emulate_read_std,
+ .write_emulated = sh_x86_emulate_write_emulated,
+ .cmpxchg_emulated = sh_x86_emulate_cmpxchg_emulated,
+ .cmpxchg8b_emulated = sh_x86_emulate_cmpxchg8b_emulated,
+};
+
+
+/**************************************************************************/
+/* Code for "promoting" a guest page to the point where the shadow code is
+ * willing to let it be treated as a guest page table. This generally
+ * involves making sure there are no writable mappings available to the guest
+ * for this page.
+ */
+/* Promote gmfn to a guest pagetable page of the given shadow 'type':
+ * mark it PGC_page_table, take a type ref the first time it is
+ * shadowed, and record 'type' in its shadow_flags. */
+void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+ unsigned long type_info;
+
+ ASSERT(valid_mfn(gmfn));
+
+ /* We should never try to promote a gmfn that has writeable mappings */
+ /* NOTE(review): this ASSERT wraps a side-effecting call
+ * (shadow_remove_write_access); in builds where ASSERT compiles
+ * away, the call is not made -- presumably the caller has already
+ * removed write access, but confirm. */
+ ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0);
+
+ // Is the page already shadowed?
+ if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
+ {
+ // No prior shadow exists...
+
+ // Grab a type-ref. We don't really care if we are racing with another
+ // vcpu or not, or even what kind of type we get; we just want the type
+ // count to be > 0.
+ //
+ do {
+ type_info =
+ page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
+ } while ( !get_page_type(page, type_info) );
+
+ // Now that the type ref is non-zero, we can safely use the
+ // shadow_flags.
+ //
+ page->shadow_flags = 0;
+ }
+
+ ASSERT(!test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
+ set_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
+
+/* Undo a shadow_promote() of the given 'type': clear the type's bit in
+ * shadow_flags, and when no shadow types remain, drop the extra type
+ * ref and the PGC_page_table marker. */
+void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+
+ ASSERT(test_bit(_PGC_page_table, &page->count_info));
+ ASSERT(test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
+
+ clear_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
+
+ if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
+ {
+ // release the extra type ref
+ put_page_type(page);
+
+ // clear the is-a-page-table bit.
+ clear_bit(_PGC_page_table, &page->count_info);
+ }
+}
+
+/**************************************************************************/
+/* Validate a pagetable change from the guest and update the shadows.
+ * Returns a bitmask of SHADOW_SET_* flags. */
+
+/* Core of validation: after a guest write of 'size' bytes at 'entry'
+ * within pagetable page gmfn, revalidate every shadow of that page by
+ * dispatching to the per-(guest,shadow)-level validators recorded in
+ * the page's shadow_flags.  Returns an OR of SHADOW_SET_* flags. */
+static int
+__shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size)
+{
+ int result = 0;
+ struct page_info *page = mfn_to_page(gmfn);
+
+ sh_mark_dirty(v->domain, gmfn);
+
+ // Determine which types of shadows are affected, and update each.
+ //
+ // Always validate L1s before L2s to prevent another cpu with a linear
+ // mapping of this gmfn from seeing a walk that results from
+ // using the new L2 value and the old L1 value. (It is OK for such a
+ // guest to see a walk that uses the old L2 value with the new L1 value,
+ // as hardware could behave this way if one level of the pagewalk occurs
+ // before the store, and the next level of the pagewalk occurs after the
+ // store.
+ //
+ // Ditto for L2s before L3s, etc.
+ //
+
+ if ( !(page->count_info & PGC_page_table) )
+ return 0; /* Not shadowed at all */
+
+#if CONFIG_PAGING_LEVELS == 2
+ if ( page->shadow_flags & SHF_L1_32 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
+ (v, gmfn, entry, size);
+#else
+ if ( page->shadow_flags & SHF_L1_32 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
+ (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS == 2
+ if ( page->shadow_flags & SHF_L2_32 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
+ (v, gmfn, entry, size);
+#else
+ if ( page->shadow_flags & SHF_L2_32 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
+ (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( page->shadow_flags & SHF_L1_PAE )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L2_PAE )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L2H_PAE )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L3_PAE )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 3, 3)
+ (v, gmfn, entry, size);
+#else /* 32-bit non-PAE hypervisor does not support PAE guests */
+ ASSERT((page->shadow_flags & (SHF_L3_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( page->shadow_flags & SHF_L1_64 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L2_64 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L3_64 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L4_64 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
+ (v, gmfn, entry, size);
+#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
+ ASSERT((page->shadow_flags
+ & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
+#endif
+
+ return result;
+}
+
+
+int
+shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
+/* This is the entry point from hypercalls. It returns a bitmask of all the
+ * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
+{
+ int rc;
+
+ /* Caller must already hold the per-domain shadow lock. */
+ ASSERT(shadow_lock_is_acquired(v->domain));
+ /* Validate a single entry's worth of bytes at 'entry' in gmfn. */
+ rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
+ shadow_audit_tables(v);
+ return rc;
+}
+
+void
+shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size)
+/* This is the entry point for emulated writes to pagetables in HVM guests.
+ * 'entry' points at the bytes just written into the guest page gmfn;
+ * 'size' is the length of that write in bytes. Caller holds the shadow
+ * lock. */
+{
+ struct domain *d = v->domain;
+ int rc;
+
+ ASSERT(shadow_lock_is_acquired(v->domain));
+ rc = __shadow_validate_guest_entry(v, gmfn, entry, size);
+ if ( rc & SHADOW_SET_FLUSH )
+ {
+ // Flush everyone except the local processor, which will flush when it
+ // re-enters the HVM guest.
+ //
+ cpumask_t mask = d->domain_dirty_cpumask;
+ cpu_clear(v->processor, mask);
+ flush_tlb_mask(mask);
+ }
+ if ( rc & SHADOW_SET_ERROR )
+ {
+ /* This page is probably not a pagetable any more: tear it out of the
+ * shadows, along with any tables that reference it */
+ shadow_remove_all_shadows_and_parents(v, gmfn);
+ }
+ /* We ignore the other bits: since we are about to change CR3 on
+ * VMENTER we don't need to do any extra TLB flushes. */
+}
+
+
+/**************************************************************************/
+/* Memory management for shadow pages. */
+
+/* Meaning of the count_info field in shadow pages
+ * ----------------------------------------------
+ *
+ * A count of all references to this page from other shadow pages and
+ * guest CR3s (a.k.a. v->arch.shadow.table).
+ *
+ * The top bits hold the shadow type and the pinned bit. Top-level
+ * shadows are pinned so that they don't disappear when not in a CR3
+ * somewhere.
+ *
+ * We don't need to use get|put_page for this as the updates are all
+ * protected by the shadow lock. We can't use get|put_page for this
+ * as the size of the count on shadow pages is different from that on
+ * normal guest pages.
+ */
+
+/* Meaning of the type_info field in shadow pages
+ * ----------------------------------------------
+ *
+ * type_info use depends on the shadow type (from count_info)
+ *
+ * PGC_SH_none : This page is in the shadow free pool. type_info holds
+ * the chunk order for our freelist allocator.
+ *
+ * PGC_SH_l*_shadow : This page is in use as a shadow. type_info
+ * holds the mfn of the guest page being shadowed,
+ *
+ * PGC_SH_fl1_*_shadow : This page is being used to shatter a superpage.
+ * type_info holds the gfn being shattered.
+ *
+ * PGC_SH_monitor_table : This page is part of a monitor table.
+ * type_info is not used.
+ */
+
+/* Meaning of the _domain field in shadow pages
+ * --------------------------------------------
+ *
+ * In shadow pages, this field will always have its least significant bit
+ * set. This ensures that all attempts to get_page() will fail (as all
+ * valid pickled domain pointers have a zero for their least significant bit).
+ * Instead, the remaining upper bits are used to record the shadow generation
+ * counter when the shadow was created.
+ */
+
+/* Meaning of the shadow_flags field
+ * ----------------------------------
+ *
+ * In guest pages that are shadowed, one bit for each kind of shadow they have.
+ *
+ * In shadow pages, will be used for holding a representation of the populated
+ * entries in this shadow (either a min/max, or a bitmap, or ...)
+ *
+ * In monitor-table pages, holds the level of the particular page (to save
+ * spilling the shadow types into an extra bit by having three types of monitor
+ * page).
+ */
+
+/* Meaning of the list_head struct in shadow pages
+ * -----------------------------------------------
+ *
+ * In free shadow pages, this is used to hold the free-lists of chunks.
+ *
+ * In top-level shadow tables, this holds a linked-list of all top-level
+ * shadows (used for recovering memory and destroying shadows).
+ *
+ * In lower-level shadows, this holds the physical address of a higher-level
+ * shadow entry that holds a reference to this shadow (or zero).
+ */
+
+/* Allocating shadow pages
+ * -----------------------
+ *
+ * Most shadow pages are allocated singly, but there are two cases where we
+ * need to allocate multiple pages together.
+ *
+ * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
+ * A 32-bit guest l1 table covers 4MB of virtual address space,
+ * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
+ * of virtual address space each). Similarly, a 32-bit guest l2 table
+ * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va
+ * each). These multi-page shadows are contiguous and aligned;
+ * functions for handling offsets into them are defined in shadow.c
+ * (shadow_l1_index() etc.)
+ *
+ * 2: Shadowing PAE top-level pages. Each guest page that contains
+ * any PAE top-level pages requires two shadow pages to shadow it.
+ * They contain alternating l3 tables and pae_l3_bookkeeping structs.
+ *
+ * This table shows the allocation behaviour of the different modes:
+ *
+ * Xen paging 32b pae pae 64b 64b 64b
+ * Guest paging 32b 32b pae 32b pae 64b
+ * PV or HVM * HVM * HVM HVM *
+ * Shadow paging 32b pae pae pae pae 64b
+ *
+ * sl1 size 4k 8k 4k 8k 4k 4k
+ * sl2 size 4k 16k 4k 16k 4k 4k
+ * sl3 size - - 8k - 8k 4k
+ * sl4 size - - - - - 4k
+ *
+ * We allocate memory from xen in four-page units and break them down
+ * with a simple buddy allocator. Can't use the xen allocator to handle
+ * this as it only works for contiguous zones, and a domain's shadow
+ * pool is made of fragments.
+ *
+ * In HVM guests, the p2m table is built out of shadow pages, and we provide
+ * a function for the p2m management to steal pages, in max-order chunks, from
+ * the free pool. We don't provide for giving them back, yet.
+ */
+
+/* Figure out the least acceptable quantity of shadow memory.
+ * The minimum memory requirement for always being able to free up a
+ * chunk of memory is very small -- only three max-order chunks per
+ * vcpu to hold the top level shadows and pages with Xen mappings in them.
+ *
+ * But for a guest to be guaranteed to successfully execute a single
+ * instruction, we must be able to map a large number (about thirty) VAs
+ * at the same time, which means that to guarantee progress, we must
+ * allow for more than ninety allocated pages per vcpu. We round that
+ * up to 128 pages, or half a megabyte per vcpu. */
+/* Return the minimum acceptable shadow pool size for domain d:
+ * 128 pages (half a megabyte) per vcpu, per the analysis in the
+ * comment block above. */
+unsigned int shadow_min_acceptable_pages(struct domain *d)
+{
+ u32 vcpu_count = 0;
+ struct vcpu *v;
+
+ for_each_vcpu(d, v)
+ vcpu_count++;
+
+ return (vcpu_count * 128);
+}
+
+/* Using the type_info field to store freelist order */
+#define SH_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
+#define SH_SET_PFN_ORDER(_p, _o) \
+ do { (_p)->u.inuse.type_info = (_o); } while (0)
+
+
+/* Figure out the order of allocation needed for a given shadow type */
+static inline u32
+shadow_order(u32 shadow_type)
+{
+#if CONFIG_PAGING_LEVELS > 2
+ /* Multi-page orders follow the size table in the "Allocating shadow
+ * pages" comment above: 32-bit guest l1s need 2 pages (order 1),
+ * 32-bit guest l2s need 4 pages (order 2), PAE l3s are paired with
+ * bookkeeping pages (order 1), and the p2m steals max-order chunks. */
+ static const u32 type_to_order[16] = {
+ 0, /* PGC_SH_none */
+ 1, /* PGC_SH_l1_32_shadow */
+ 1, /* PGC_SH_fl1_32_shadow */
+ 2, /* PGC_SH_l2_32_shadow */
+ 0, /* PGC_SH_l1_pae_shadow */
+ 0, /* PGC_SH_fl1_pae_shadow */
+ 0, /* PGC_SH_l2_pae_shadow */
+ 0, /* PGC_SH_l2h_pae_shadow */
+ 1, /* PGC_SH_l3_pae_shadow */
+ 0, /* PGC_SH_l1_64_shadow */
+ 0, /* PGC_SH_fl1_64_shadow */
+ 0, /* PGC_SH_l2_64_shadow */
+ 0, /* PGC_SH_l3_64_shadow */
+ 0, /* PGC_SH_l4_64_shadow */
+ 2, /* PGC_SH_p2m_table */
+ 0 /* PGC_SH_monitor_table */
+ };
+ u32 type = (shadow_type & PGC_SH_type_mask) >> PGC_SH_type_shift;
+ return type_to_order[type];
+#else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
+ return 0;
+#endif
+}
+
+
+/* Do we have a free chunk of at least this order?
+ * (A larger chunk can always be split down, so any order >= the
+ * requested one satisfies the request.) */
+static inline int chunk_is_available(struct domain *d, int order)
+{
+ int i;
+
+ for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
+ if ( !list_empty(&d->arch.shadow.freelists[i]) )
+ return 1;
+ return 0;
+}
+
+/* Dispatcher function: call the per-mode function that will unhook the
+ * non-Xen mappings in this top-level shadow mfn */
+void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ /* Dispatch on the shadow type stored in the top bits of count_info. */
+ switch ( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift )
+ {
+ case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
+#else
+ SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
+#endif
+ break;
+#if CONFIG_PAGING_LEVELS >= 3
+ case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
+ break;
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
+ case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
+ break;
+#endif
+ default:
+ /* Only top-level shadow types are legal here. */
+ SHADOW_PRINTK("top-level shadow has bad type %08lx\n",
+ (unsigned long)((pg->count_info & PGC_SH_type_mask)
+ >> PGC_SH_type_shift));
+ BUG();
+ }
+}
+
+
+/* Make sure there is at least one chunk of the required order available
+ * in the shadow page pool. This must be called before any calls to
+ * shadow_alloc(). Since this will free existing shadows to make room,
+ * it must be called early enough to avoid freeing shadows that the
+ * caller is currently working on. */
+void shadow_prealloc(struct domain *d, unsigned int order)
+{
+ /* Need a vcpu for calling unpins; for now, since we don't have
+ * per-vcpu shadows, any will do */
+ struct vcpu *v = d->vcpu[0];
+ struct list_head *l, *t;
+ struct page_info *pg;
+ mfn_t smfn;
+
+ if ( chunk_is_available(d, order) ) return;
+
+ /* Stage one: walk the list of top-level pages, unpinning them */
+ perfc_incrc(shadow_prealloc_1);
+ list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ smfn = page_to_mfn(pg);
+
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( (pg->count_info & PGC_SH_type_mask) == PGC_SH_l3_pae_shadow )
+ {
+ /* For PAE, we need to unpin each subshadow on this shadow */
+ SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn);
+ }
+ else
+#endif /* 32-bit code always takes this branch */
+ {
+ /* Unpin this top-level shadow */
+ sh_unpin(v, smfn);
+ }
+
+ /* See if that freed up a chunk of appropriate size */
+ if ( chunk_is_available(d, order) ) return;
+ }
+
+ /* Stage two: all shadow pages are in use in hierarchies that are
+ * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
+ * mappings. */
+ perfc_incrc(shadow_prealloc_2);
+ /* Prefer the current vcpu so the TLB-flush check below is meaningful. */
+ v = current;
+ if ( v->domain != d )
+ v = d->vcpu[0];
+ /* Walk the list from the tail: recently used toplevels have been pulled
+ * to the head */
+ list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ smfn = page_to_mfn(pg);
+ shadow_unhook_mappings(v, smfn);
+
+ /* Need to flush TLB if we've altered our own tables */
+ if ( !shadow_mode_external(d)
+ && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
+ local_flush_tlb();
+
+ /* See if that freed up a chunk of appropriate size */
+ if ( chunk_is_available(d, order) ) return;
+ }
+
+ /* Nothing more we can do: all remaining shadows are of pages that
+ * hold Xen mappings for some vcpu. This should never happen. */
+ SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
+ " shadow pages total = %u, free = %u, p2m=%u\n",
+ 1 << order,
+ d->arch.shadow.total_pages,
+ d->arch.shadow.free_pages,
+ d->arch.shadow.p2m_pages);
+ BUG();
+}
+
+
+/* Allocate another shadow's worth of (contiguous, aligned) pages,
+ * and fill in the type and backpointer fields of their page_infos.
+ * Never fails to allocate (BUGs instead): shadow_prealloc() must have
+ * been called first to guarantee a suitable chunk exists. */
+mfn_t shadow_alloc(struct domain *d,
+ u32 shadow_type,
+ unsigned long backpointer)
+{
+ struct page_info *pg = NULL;
+ unsigned int order = shadow_order(shadow_type);
+ cpumask_t mask;
+ void *p;
+ int i;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ ASSERT(order <= SHADOW_MAX_ORDER);
+ ASSERT(shadow_type != PGC_SH_none);
+ perfc_incrc(shadow_alloc);
+
+ /* Find smallest order which can satisfy the request. */
+ for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
+ if ( !list_empty(&d->arch.shadow.freelists[i]) )
+ {
+ pg = list_entry(d->arch.shadow.freelists[i].next,
+ struct page_info, list);
+ list_del(&pg->list);
+
+ /* We may have to halve the chunk a number of times.
+ * (Buddy split: return the upper half to the next-lower
+ * freelist each time round.) */
+ while ( i != order )
+ {
+ i--;
+ SH_SET_PFN_ORDER(pg, i);
+ list_add_tail(&pg->list, &d->arch.shadow.freelists[i]);
+ pg += 1 << i;
+ }
+ d->arch.shadow.free_pages -= 1 << order;
+
+ /* Init page info fields and clear the pages */
+ for ( i = 0; i < 1<<order ; i++ )
+ {
+ pg[i].u.inuse.type_info = backpointer;
+ pg[i].count_info = shadow_type;
+ pg[i].shadow_flags = 0;
+ INIT_LIST_HEAD(&pg[i].list);
+ /* Before we overwrite the old contents of this page,
+ * we need to be sure that no TLB holds a pointer to it. */
+ mask = d->domain_dirty_cpumask;
+ tlbflush_filter(mask, pg[i].tlbflush_timestamp);
+ if ( unlikely(!cpus_empty(mask)) )
+ {
+ perfc_incrc(shadow_alloc_tlbflush);
+ flush_tlb_mask(mask);
+ }
+ /* Now safe to clear the page for reuse */
+ p = sh_map_domain_page(page_to_mfn(pg+i));
+ ASSERT(p != NULL);
+ clear_page(p);
+ sh_unmap_domain_page(p);
+ perfc_incr(shadow_alloc_count);
+ }
+ return page_to_mfn(pg);
+ }
+
+ /* If we get here, we failed to allocate. This should never happen.
+ * It means that we didn't call shadow_prealloc() correctly before
+ * we allocated. We can't recover by calling prealloc here, because
+ * we might free up higher-level pages that the caller is working on. */
+ SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
+ BUG();
+}
+
+
+/* Return some shadow pages to the pool, coalescing buddies back into
+ * larger chunks where possible. Caller holds the shadow lock. */
+void shadow_free(struct domain *d, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ u32 shadow_type;
+ unsigned long order;
+ unsigned long mask;
+ int i;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ perfc_incrc(shadow_free);
+
+ shadow_type = pg->count_info & PGC_SH_type_mask;
+ ASSERT(shadow_type != PGC_SH_none);
+ ASSERT(shadow_type != PGC_SH_p2m_table);
+ order = shadow_order(shadow_type);
+
+ d->arch.shadow.free_pages += 1 << order;
+
+ for ( i = 0; i < 1<<order; i++ )
+ {
+ /* Strip out the type: this is now a free shadow page */
+ pg[i].count_info = 0;
+ /* Remember the TLB timestamp so we will know whether to flush
+ * TLBs when we reuse the page. Because the destructors leave the
+ * contents of the pages in place, we can delay TLB flushes until
+ * just before the allocator hands the page out again. */
+ pg[i].tlbflush_timestamp = tlbflush_current_time();
+ perfc_decr(shadow_alloc_count);
+ }
+
+ /* Merge chunks as far as possible. */
+ while ( order < SHADOW_MAX_ORDER )
+ {
+ mask = 1 << order;
+ /* NOTE(review): these comparisons test a PGC_SH_type_mask field
+ * against PGT_none — presumably both constants are zero so this
+ * works, but confirm this should not read PGC_SH_none. */
+ if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
+ /* Merge with predecessor block? */
+ if ( (((pg-mask)->count_info & PGC_SH_type_mask) != PGT_none)
+ || (SH_PFN_ORDER(pg-mask) != order) )
+ break;
+ list_del(&(pg-mask)->list);
+ pg -= mask;
+ } else {
+ /* Merge with successor block? */
+ if ( (((pg+mask)->count_info & PGC_SH_type_mask) != PGT_none)
+ || (SH_PFN_ORDER(pg+mask) != order) )
+ break;
+ list_del(&(pg+mask)->list);
+ }
+ order++;
+ }
+
+ SH_SET_PFN_ORDER(pg, order);
+ list_add_tail(&pg->list, &d->arch.shadow.freelists[order]);
+}
+
+/* Divert some memory from the pool to be used by the p2m mapping.
+ * This action is irreversible: the p2m mapping only ever grows.
+ * That's OK because the p2m table only exists for external domains,
+ * and those domains can't ever turn off shadow mode.
+ * Also, we only ever allocate a max-order chunk, so as to preserve
+ * the invariant that shadow_prealloc() always works.
+ * Returns 0 iff it can't get a chunk (the caller should then
+ * free up some pages in domheap and call set_sh_allocation);
+ * returns non-zero on success.
+ */
+static int
+shadow_alloc_p2m_pages(struct domain *d)
+{
+ struct page_info *pg;
+ u32 i;
+ ASSERT(shadow_lock_is_acquired(d));
+
+ if ( d->arch.shadow.total_pages
+ < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
+ return 0; /* Not enough shadow memory: need to increase it first */
+
+ /* Move one max-order chunk from the shadow pool to the p2m freelist. */
+ pg = mfn_to_page(shadow_alloc(d, PGC_SH_p2m_table, 0));
+ d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
+ d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
+ for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
+ {
+ /* Unlike shadow pages, mark p2m pages as owned by the domain */
+ page_set_owner(&pg[i], d);
+ list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
+ }
+ return 1;
+}
+
+// Allocate and zero one page for the p2m table, refilling the p2m
+// freelist from the shadow pool if it is empty.
+// Returns _mfn(0) if no memory is available.
+mfn_t
+shadow_alloc_p2m_page(struct domain *d)
+{
+ struct list_head *entry;
+ mfn_t mfn;
+ void *p;
+
+ if ( list_empty(&d->arch.shadow.p2m_freelist) &&
+ !shadow_alloc_p2m_pages(d) )
+ return _mfn(0);
+ entry = d->arch.shadow.p2m_freelist.next;
+ list_del(entry);
+ list_add_tail(entry, &d->arch.shadow.p2m_inuse);
+ mfn = page_to_mfn(list_entry(entry, struct page_info, list));
+ /* Take the single reference that p2m_teardown expects to find. */
+ sh_get_ref(mfn, 0);
+ p = sh_map_domain_page(mfn);
+ clear_page(p);
+ sh_unmap_domain_page(p);
+
+ return mfn;
+}
+
+#if CONFIG_PAGING_LEVELS == 3
+static void p2m_install_entry_in_monitors(struct domain *d,
+ l3_pgentry_t *l3e)
+/* Special case, only used for external-mode domains on PAE hosts:
+ * update the mapping of the p2m table. Once again, this is trivial in
+ * other paging modes (one top-level entry points to the top-level p2m,
+ * no maintenance needed), but PAE makes life difficult by needing a
+ * copy the eight l3es of the p2m table in eight l2h slots in the
+ * monitor table. This function makes fresh copies when a p2m l3e
+ * changes. */
+{
+ l2_pgentry_t *ml2e;
+ struct vcpu *v;
+ unsigned int index;
+
+ /* Which of the p2m's l3 entries changed? (derived from the entry's
+ * offset within its page) */
+ index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
+ ASSERT(index < MACHPHYS_MBYTES>>1);
+
+ for_each_vcpu(d, v)
+ {
+ if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ continue;
+ ASSERT(shadow_mode_external(v->domain));
+
+ SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
+ d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
+
+ if ( v == current ) /* OK to use linear map of monitor_table */
+ ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
+ else
+ {
+ /* Walk the vcpu's monitor table by hand to find its l2h. */
+ l3_pgentry_t *ml3e;
+ ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
+ ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
+ ml2e += l2_table_offset(RO_MPT_VIRT_START);
+ sh_unmap_domain_page(ml3e);
+ }
+ ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
+ if ( v != current )
+ sh_unmap_domain_page(ml2e);
+ }
+}
+#endif
+
+// Find the next level's P2M entry, checking for out-of-range gfn's...
+// On success, masks the consumed bits out of *gfn_remainder.
+// Returns NULL on error.
+//
+static l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+ unsigned long gfn, u32 shift, u32 max)
+{
+ u32 index;
+
+ index = *gfn_remainder >> shift;
+ if ( index >= max )
+ {
+ SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
+ "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+ gfn, *gfn_remainder, shift, index, max);
+ return NULL;
+ }
+ *gfn_remainder &= (1 << shift) - 1;
+ return (l1_pgentry_t *)table + index;
+}
+
+// Walk one level of the P2M table, allocating a new table if required.
+// On success, *table is remapped to the next level down (the previous
+// mapping is dropped) and *table_mfn is updated to match.
+// Returns 0 on error.
+//
+static int
+p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
+ unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
+ u32 max, unsigned long type)
+{
+ l1_pgentry_t *p2m_entry;
+ void *next;
+
+ if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
+ shift, max)) )
+ return 0;
+
+ if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+ {
+ mfn_t mfn = shadow_alloc_p2m_page(d);
+ if ( mfn_x(mfn) == 0 )
+ return 0;
+ *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+ mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
+ mfn_to_page(mfn)->count_info = 1;
+#if CONFIG_PAGING_LEVELS == 3
+ if (type == PGT_l2_page_table)
+ {
+ /* We have written to the p2m l3: need to sync the per-vcpu
+ * copies of it in the monitor tables */
+ p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
+ }
+#endif
+ /* The P2M can be shadowed: keep the shadows synced */
+ if ( d->vcpu[0] )
+ (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn,
+ p2m_entry, sizeof *p2m_entry);
+ }
+ *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
+ next = sh_map_domain_page(*table_mfn);
+ sh_unmap_domain_page(*table);
+ *table = next;
+
+ return 1;
+}
+
+// Set the p2m entry for gfn to mfn (or clear it if mfn is invalid).
+// Walks (and grows, if needed) the p2m pagetable under the shadow lock.
+// Returns 0 on error (out of memory).
+int
+shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+ // XXX -- this might be able to be faster iff current->domain == d
+ mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
+ void *table = sh_map_domain_page(table_mfn);
+ unsigned long gfn_remainder = gfn;
+ l1_pgentry_t *p2m_entry;
+ int rv = 0;
+
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+ goto out;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+ // When using PAE Xen, we only allow 33 bits of pseudo-physical
+ // address in translated guests (i.e. 8 GBytes). This restriction
+ // comes from wanting to map the P2M table into the 16MB RO_MPT hole
+ // in Xen's address space for translated PV guests.
+ //
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+ (CONFIG_PAGING_LEVELS == 3
+ ? 8
+ : L3_PAGETABLE_ENTRIES),
+ PGT_l2_page_table) )
+ goto out;
+#endif
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+ goto out;
+
+ p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+ 0, L1_PAGETABLE_ENTRIES);
+ ASSERT(p2m_entry);
+ if ( valid_mfn(mfn) )
+ *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+ else
+ *p2m_entry = l1e_empty();
+
+ /* The P2M can be shadowed: keep the shadows synced.
+ * (Guard vcpu[0] as p2m_next_level does: it may not exist yet.) */
+ if ( d->vcpu[0] )
+ (void) __shadow_validate_guest_entry(d->vcpu[0], table_mfn,
+ p2m_entry, sizeof *p2m_entry);
+ rv = 1;
+
+ out:
+ /* Always drop the mapping of the last-walked level; the early error
+ * returns previously leaked this sh_map_domain_page() mapping. */
+ sh_unmap_domain_page(table);
+ return rv;
+}
+
+// Allocate a new p2m table for a domain.
+//
+// The structure of the p2m table is that of a pagetable for xen (i.e. it is
+// controlled by CONFIG_PAGING_LEVELS).
+//
+// Returns 0 if p2m table could not be initialized
+//
+static int
+shadow_alloc_p2m_table(struct domain *d)
+{
+ mfn_t p2m_top;
+ struct list_head *entry;
+ unsigned int page_count = 0;
+
+ SHADOW_PRINTK("allocating p2m table\n");
+ ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
+
+ p2m_top = shadow_alloc_p2m_page(d);
+ /* Check for allocation failure *before* touching the page_info:
+ * shadow_alloc_p2m_page() returns _mfn(0) when out of memory, and
+ * the previous code dereferenced mfn_to_page(p2m_top) first. */
+ if ( mfn_x(p2m_top) == 0 )
+ return 0;
+ mfn_to_page(p2m_top)->count_info = 1;
+ mfn_to_page(p2m_top)->u.inuse.type_info =
+#if CONFIG_PAGING_LEVELS == 4
+ PGT_l4_page_table
+#elif CONFIG_PAGING_LEVELS == 3
+ PGT_l3_page_table
+#elif CONFIG_PAGING_LEVELS == 2
+ PGT_l2_page_table
+#endif
+ | 1 | PGT_validated;
+
+ d->arch.phys_table = pagetable_from_mfn(p2m_top);
+
+ SHADOW_PRINTK("populating p2m table\n");
+
+ /* Add a p2m entry for every page the domain currently owns. */
+ for ( entry = d->page_list.next;
+ entry != &d->page_list;
+ entry = entry->next )
+ {
+ struct page_info *page = list_entry(entry, struct page_info, list);
+ mfn_t mfn = page_to_mfn(page);
+ unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
+ page_count++;
+ if (
+ /* Skip pages with no valid m2p entry (0x55.. is the poison
+ * pattern the m2p table is initialised with). */
+#ifdef __x86_64__
+ (gfn != 0x5555555555555555L)
+#else
+ (gfn != 0x55555555L)
+#endif
+ && gfn != INVALID_M2P_ENTRY
+ && !shadow_set_p2m_entry(d, gfn, mfn) )
+ {
+ SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH_PRI_mfn "\n",
+ gfn, mfn_x(mfn));
+ return 0;
+ }
+ }
+
+ SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
+ return 1;
+}
+
+mfn_t
+sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+/* Read another domain's p2m entries. Walks the p2m pagetable by hand,
+ * one level at a time, returning _mfn(INVALID_MFN) if any level is
+ * absent or the gpfn is out of range. */
+{
+ mfn_t mfn;
+ unsigned long addr = gpfn << PAGE_SHIFT;
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+
+ ASSERT(shadow_mode_translate(d));
+ mfn = pagetable_get_mfn(d->arch.phys_table);
+
+
+#if CONFIG_PAGING_LEVELS > 2
+ if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
+ /* This pfn is higher than the p2m map can hold */
+ return _mfn(INVALID_MFN);
+#endif
+
+
+#if CONFIG_PAGING_LEVELS >= 4
+ {
+ l4_pgentry_t *l4e = sh_map_domain_page(mfn);
+ l4e += l4_table_offset(addr);
+ if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+ {
+ sh_unmap_domain_page(l4e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l4e_get_pfn(*l4e));
+ sh_unmap_domain_page(l4e);
+ }
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+ {
+ l3_pgentry_t *l3e = sh_map_domain_page(mfn);
+ l3e += l3_table_offset(addr);
+ if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+ {
+ sh_unmap_domain_page(l3e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l3e_get_pfn(*l3e));
+ sh_unmap_domain_page(l3e);
+ }
+#endif
+
+ l2e = sh_map_domain_page(mfn);
+ l2e += l2_table_offset(addr);
+ if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+ {
+ sh_unmap_domain_page(l2e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l2e_get_pfn(*l2e));
+ sh_unmap_domain_page(l2e);
+
+ l1e = sh_map_domain_page(mfn);
+ l1e += l1_table_offset(addr);
+ if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+ {
+ sh_unmap_domain_page(l1e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l1e_get_pfn(*l1e));
+ sh_unmap_domain_page(l1e);
+
+ return mfn;
+}
+
+/* Convenience wrapper: gfn->mfn lookup for the current domain,
+ * returned as a raw unsigned long rather than an mfn_t. */
+unsigned long
+shadow_gfn_to_mfn_foreign(unsigned long gpfn)
+{
+ return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
+}
+
+
+static void shadow_p2m_teardown(struct domain *d)
+/* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+{
+ struct list_head *entry, *n;
+ struct page_info *pg;
+
+ d->arch.phys_table = pagetable_null();
+
+ /* First the pages that are in use as p2m tables... */
+ list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
+ {
+ pg = list_entry(entry, struct page_info, list);
+ list_del(entry);
+ /* Should have just the one ref we gave it in alloc_p2m_page() */
+ if ( (pg->count_info & PGC_SH_count_mask) != 1 )
+ {
+ SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
+ pg->count_info, pg->u.inuse.type_info);
+ }
+ ASSERT(page_get_owner(pg) == d);
+ /* Free should not decrement domain's total allocation, since
+ * these pages were allocated without an owner. */
+ page_set_owner(pg, NULL);
+ free_domheap_pages(pg, 0);
+ d->arch.shadow.p2m_pages--;
+ perfc_decr(shadow_alloc_count);
+ }
+ /* ...then the pages still sitting on the p2m freelist. */
+ list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
+ {
+ list_del(entry);
+ pg = list_entry(entry, struct page_info, list);
+ ASSERT(page_get_owner(pg) == d);
+ /* Free should not decrement domain's total allocation. */
+ page_set_owner(pg, NULL);
+ free_domheap_pages(pg, 0);
+ d->arch.shadow.p2m_pages--;
+ perfc_decr(shadow_alloc_count);
+ }
+ ASSERT(d->arch.shadow.p2m_pages == 0);
+}
+
+/* Set the pool of shadow pages to the required number of pages.
+ * Input will be rounded up to at least shadow_min_acceptable_pages(),
+ * plus space for the p2m table.
+ * Returns 0 for success, non-zero for failure.
+ * NOTE(review): the return type is unsigned int but the failure path
+ * returns -ENOMEM, and preemption also returns 0 (with *preempted set)
+ * — callers must treat 0-with-preemption as "retry", not success;
+ * confirm this is the intended contract. */
+static unsigned int set_sh_allocation(struct domain *d,
+ unsigned int pages,
+ int *preempted)
+{
+ struct page_info *pg;
+ unsigned int lower_bound;
+ int j;
+
+ ASSERT(shadow_lock_is_acquired(d));
+
+ /* Don't allocate less than the minimum acceptable, plus one page per
+ * megabyte of RAM (for the p2m table) */
+ lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
+ if ( pages > 0 && pages < lower_bound )
+ pages = lower_bound;
+ /* Round up to largest block size */
+ pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
+
+ SHADOW_PRINTK("current %i target %i\n",
+ d->arch.shadow.total_pages, pages);
+
+ /* Grow or shrink one max-order chunk at a time until we hit target. */
+ while ( d->arch.shadow.total_pages != pages )
+ {
+ if ( d->arch.shadow.total_pages < pages )
+ {
+ /* Need to allocate more memory from domheap */
+ pg = alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
+ if ( pg == NULL )
+ {
+ SHADOW_PRINTK("failed to allocate shadow pages.\n");
+ return -ENOMEM;
+ }
+ d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
+ d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
+ for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
+ {
+ pg[j].u.inuse.type_info = 0; /* Free page */
+ pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
+ }
+ SH_SET_PFN_ORDER(pg, SHADOW_MAX_ORDER);
+ list_add_tail(&pg->list,
+ &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
+ }
+ else if ( d->arch.shadow.total_pages > pages )
+ {
+ /* Need to return memory to domheap */
+ shadow_prealloc(d, SHADOW_MAX_ORDER);
+ ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
+ pg = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next,
+ struct page_info, list);
+ list_del(&pg->list);
+ d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
+ d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
+ free_domheap_pages(pg, SHADOW_MAX_ORDER);
+ }
+
+ /* Check to see if we need to yield and try again */
+ if ( preempted && hypercall_preempt_check() )
+ {
+ *preempted = 1;
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+unsigned int shadow_set_allocation(struct domain *d,
+ unsigned int megabytes,
+ int *preempted)
+/* Hypercall interface to set the shadow memory allocation.
+ * Takes the shadow lock around set_sh_allocation(); megabytes is
+ * converted to pages before the call. */
+{
+ unsigned int rv;
+ shadow_lock(d);
+ rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
+ SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n",
+ d->domain_id,
+ d->arch.shadow.total_pages,
+ shadow_get_allocation(d));
+ shadow_unlock(d);
+ return rv;
+}
+
+/**************************************************************************/
+/* Hash table for storing the guest->shadow mappings */
+
+/* Hash function that takes a gfn or mfn, plus another byte of type info */
+typedef u32 key_t;
+static inline key_t sh_hash(unsigned long n, u8 t)
+{
+ unsigned char *p = (unsigned char *)&n;
+ key_t k = t;
+ int i;
+ /* sdbm-style string hash folded over the bytes of n, seeded with t:
+ * each step is k = k*65599 + byte (65599 == (1<<6)+(1<<16)-1). */
+ for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
+ return k;
+}
+
+#if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
+
+/* Before we get to the mechanism, define a pair of audit functions
+ * that sanity-check the contents of the hash table. */
+static void sh_hash_audit_bucket(struct domain *d, int bucket)
+/* Audit one bucket of the hash table: BUGs on any structural
+ * inconsistency between the hash entries and the shadow page_infos. */
+{
+ struct shadow_hash_entry *e, *x;
+ struct page_info *pg;
+
+ if ( !(SHADOW_AUDIT_ENABLE) )
+ return;
+
+ e = &d->arch.shadow.hash_table[bucket];
+ if ( e->t == 0 ) return; /* Bucket is empty */
+ while ( e )
+ {
+ /* Empty link? */
+ BUG_ON( e->t == 0 );
+ /* Bogus type? */
+ BUG_ON( e->t > (PGC_SH_max_shadow >> PGC_SH_type_shift) );
+ /* Wrong bucket? */
+ BUG_ON( sh_hash(e->n, e->t) % SHADOW_HASH_BUCKETS != bucket );
+ /* Duplicate entry? */
+ for ( x = e->next; x; x = x->next )
+ BUG_ON( x->n == e->n && x->t == e->t );
+ /* Bogus MFN? */
+ BUG_ON( !valid_mfn(e->smfn) );
+ pg = mfn_to_page(e->smfn);
+ /* Not a shadow? (shadow pages have no owner) */
+ BUG_ON( page_get_owner(pg) != 0 );
+ /* Wrong kind of shadow? */
+ BUG_ON( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift
+ != e->t );
+ /* Bad backlink? */
+ BUG_ON( pg->u.inuse.type_info != e->n );
+ /* fl1 shadows back onto a gfn, not a guest page, so the
+ * shadow_flags check below does not apply to them. */
+ if ( e->t != (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
+ && e->t != (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
+ && e->t != (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) )
+ {
+ /* Bad shadow flags on guest page? */
+ BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow_flags & (1<<e->t)) );
+ }
+ /* That entry was OK; on we go */
+ e = e->next;
+ }
+}
+
+#else
+#define sh_hash_audit_bucket(_d, _b)
+#endif /* Hashtable bucket audit */
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
+
+static void sh_hash_audit(struct domain *d)
+/* Full audit: audit every bucket in the table */
+{
+ int i;
+
+ if ( !(SHADOW_AUDIT_ENABLE) )
+ return;
+
+ for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
+ {
+ sh_hash_audit_bucket(d, i);
+ }
+}
+
+#else
+#define sh_hash_audit(_d)
+#endif /* Hashtable bucket audit */
+
+/* Memory management interface for bucket allocation.
+ * These ought to come out of shadow memory, but at least on 32-bit
+ * machines we are forced to allocate them from xenheap so that we can
+ * address them. */
+static struct shadow_hash_entry *sh_alloc_hash_entry(struct domain *d)
+{
+ struct shadow_hash_entry *extra, *x;
+ int i;
+
+ /* We need to allocate a new node. Ensure the free list is not empty.
+ * Allocate new entries in units the same size as the original table. */
+ if ( unlikely(d->arch.shadow.hash_freelist == NULL) )
+ {
+ /* Block layout: SHADOW_HASH_BUCKETS entries followed by one
+ * pointer linking this block into the hash_allocations chain. */
+ size_t sz = sizeof(void *) + (SHADOW_HASH_BUCKETS * sizeof(*x));
+ extra = xmalloc_bytes(sz);
+
+ if ( extra == NULL )
+ {
+ /* No memory left! */
+ SHADOW_ERROR("xmalloc() failed when allocating hash buckets.\n");
+ domain_crash_synchronous();
+ }
+ memset(extra, 0, sz);
+
+ /* Record the allocation block so it can be correctly freed later. */
+ *((struct shadow_hash_entry **)&extra[SHADOW_HASH_BUCKETS]) =
+ d->arch.shadow.hash_allocations;
+ d->arch.shadow.hash_allocations = &extra[0];
+
+ /* Thread a free chain through the newly-allocated nodes. */
+ for ( i = 0; i < (SHADOW_HASH_BUCKETS - 1); i++ )
+ extra[i].next = &extra[i+1];
+ extra[i].next = NULL;
+
+ /* Add the new nodes to the free list. */
+ d->arch.shadow.hash_freelist = &extra[0];
+ }
+
+ /* Allocate a new node from the free list. */
+ x = d->arch.shadow.hash_freelist;
+ d->arch.shadow.hash_freelist = x->next;
+ return x;
+}
+
+static void sh_free_hash_entry(struct domain *d, struct shadow_hash_entry *e)
+/* Return one chained hash entry to the per-domain free list.  The
+ * backing memory is only given back to Xen in shadow_hash_teardown(). */
+{
+    /* Mark the bucket as empty and return it to the free list */
+    e->t = 0; 
+    e->next = d->arch.shadow.hash_freelist;
+    d->arch.shadow.hash_freelist = e;
+}
+
+
+/* Allocate and initialise the table itself.  
+ * Returns 0 for success, 1 for error. */
+static int shadow_hash_alloc(struct domain *d)
+{
+    struct shadow_hash_entry *table;
+
+    /* Caller must hold the shadow lock and not have a table already. */
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(!d->arch.shadow.hash_table);
+
+    table = xmalloc_array(struct shadow_hash_entry, SHADOW_HASH_BUCKETS);
+    if ( !table ) return 1;
+    /* Zeroing sets every head entry's t to 0, i.e. all buckets empty. */
+    memset(table, 0, 
+           SHADOW_HASH_BUCKETS * sizeof (struct shadow_hash_entry));
+    d->arch.shadow.hash_table = table;
+    return 0;
+}
+
+/* Tear down the hash table and return all memory to Xen.
+ * This function does not care whether the table is populated. */
+static void shadow_hash_teardown(struct domain *d)
+{
+    struct shadow_hash_entry *a, *n;
+
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(d->arch.shadow.hash_table);
+
+    /* Return the table itself */
+    xfree(d->arch.shadow.hash_table);
+    d->arch.shadow.hash_table = NULL;
+
+    /* Return any extra allocations (batches made by sh_alloc_hash_entry) */
+    a = d->arch.shadow.hash_allocations;
+    while ( a ) 
+    {
+        /* We stored a linked-list pointer at the end of each allocation */
+        n = *((struct shadow_hash_entry **)(&a[SHADOW_HASH_BUCKETS]));
+        xfree(a);
+        a = n;
+    }
+    /* Free-list nodes all lived inside the blocks just freed. */
+    d->arch.shadow.hash_allocations = NULL;
+    d->arch.shadow.hash_freelist = NULL;
+}
+
+
+mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
+/* Find an entry in the hash table.  Returns the MFN of the shadow,
+ * or INVALID_MFN if it doesn't exist.
+ * On a hit in the middle of a chain, moves the entry to the front
+ * (by content-swap with the embedded head) -- unless someone is
+ * currently walking the chains, in which case no reordering is done. */
+{
+    struct domain *d = v->domain;
+    struct shadow_hash_entry *p, *x, *head;
+    key_t key;
+
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(d->arch.shadow.hash_table);
+    ASSERT(t);
+
+    sh_hash_audit(d);
+
+    perfc_incrc(shadow_hash_lookups);
+    key = sh_hash(n, t);
+
+    x = head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
+    p = NULL;
+
+    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
+
+    do
+    {
+        /* An empty head entry is only legal if it has no chain behind it. */
+        ASSERT(x->t || ((x == head) && (x->next == NULL)));
+
+        if ( x->n == n && x->t == t )
+        {
+            /* Pull-to-front if 'x' isn't already the head item */
+            if ( unlikely(x != head) )
+            {
+                if ( unlikely(d->arch.shadow.hash_walking != 0) )
+                    /* Can't reorder: someone is walking the hash chains */
+                    return x->smfn;
+                else 
+                {
+                    /* Delete 'x' from list and reinsert after head. */
+                    p->next = x->next;
+                    x->next = head->next;
+                    head->next = x;
+                    
+                    /* Swap 'x' contents with head contents.  (The head
+                     * entry is embedded in the table, so it can't move.) */
+                    SWAP(head->n, x->n);
+                    SWAP(head->t, x->t);
+                    SWAP(head->smfn, x->smfn);
+                }
+            }
+            else
+            {
+                perfc_incrc(shadow_hash_lookup_head);
+            }
+            return head->smfn;
+        }
+
+        p = x;
+        x = x->next;
+    }
+    while ( x != NULL );
+
+    perfc_incrc(shadow_hash_lookup_miss);
+    return _mfn(INVALID_MFN);
+}
+
+void shadow_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Put a mapping (n,t)->smfn into the hash table.
+ * The caller must hold the shadow lock; (n,t) must not already be
+ * present (no duplicate check is made here -- the bucket audit would
+ * catch one). */
+{
+    struct domain *d = v->domain;
+    struct shadow_hash_entry *x, *head;
+    key_t key;
+    
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(d->arch.shadow.hash_table);
+    ASSERT(t);
+
+    sh_hash_audit(d);
+
+    perfc_incrc(shadow_hash_inserts);
+    key = sh_hash(n, t);
+
+    head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
+
+    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
+
+    /* If the bucket is empty then insert the new page as the head item. */
+    if ( head->t == 0 )
+    {
+        head->n = n;
+        head->t = t;
+        head->smfn = smfn;
+        ASSERT(head->next == NULL);
+    }
+    else 
+    {
+        /* Insert a new entry directly after the head item. */
+        x = sh_alloc_hash_entry(d);
+        x->n = n; 
+        x->t = t;
+        x->smfn = smfn;
+        x->next = head->next;
+        head->next = x;
+    }
+    
+    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
+}
+
+void shadow_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Excise the mapping (n,t)->smfn from the hash table.
+ * The mapping MUST be present: the chain walk BUGs (via ASSERT) if it
+ * runs off the end without finding it. */
+{
+    struct domain *d = v->domain;
+    struct shadow_hash_entry *p, *x, *head;
+    key_t key;
+
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(d->arch.shadow.hash_table);
+    ASSERT(t);
+
+    sh_hash_audit(d);
+
+    perfc_incrc(shadow_hash_deletes);
+    key = sh_hash(n, t);
+
+    head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
+
+    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
+
+    /* Match on head item? */
+    if ( head->n == n && head->t == t )
+    {
+        if ( (x = head->next) != NULL )
+        {
+            /* Overwrite head with contents of following node.  (The head
+             * is embedded in the table and cannot itself be freed.) */
+            head->n = x->n;
+            head->t = x->t;
+            head->smfn = x->smfn;
+
+            /* Delete following node. */
+            head->next = x->next;
+            sh_free_hash_entry(d, x);
+        }
+        else
+        {
+            /* This bucket is now empty. Initialise the head node. */
+            head->t = 0;
+        }
+    }
+    else 
+    {
+        /* Not at the head; need to walk the chain */
+        p = head;
+        x = head->next; 
+        
+        while(1)
+        {
+            ASSERT(x); /* We can't have hit the end, since our target is
+                        * still in the chain somewhere... */
+            if ( x->n == n && x->t == t )
+            {
+                /* Delete matching node. */
+                p->next = x->next;
+                sh_free_hash_entry(d, x);
+                break;
+            }
+            p = x;
+            x = x->next;
+        }
+    }
+
+    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
+}
+
+typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
+
+static void hash_foreach(struct vcpu *v, 
+                         unsigned int callback_mask, 
+                         hash_callback_t callbacks[], 
+                         mfn_t callback_mfn)
+/* Walk the hash table looking at the types of the entries and 
+ * calling the appropriate callback function for each entry. 
+ * The mask determines which shadow types we call back for, and the array
+ * of callbacks tells us which function to call.
+ * Any callback may return non-zero to let us skip the rest of the scan. 
+ *
+ * WARNING: Callbacks MUST NOT add or remove hash entries unless they 
+ * then return non-zero to terminate the scan. */
+{
+    int i, done = 0;
+    struct domain *d = v->domain;
+    struct shadow_hash_entry *x;
+    
+    /* Say we're here, to stop hash-lookups reordering the chains */
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(d->arch.shadow.hash_walking == 0);
+    d->arch.shadow.hash_walking = 1;
+
+    /* Bit 0 corresponds to type 0, which marks empty head entries. */
+    callback_mask &= ~1; /* Never attempt to call back on empty buckets */
+    for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) 
+    {
+        /* WARNING: This is not safe against changes to the hash table.
+         * The callback *must* return non-zero if it has inserted or
+         * deleted anything from the hash (lookups are OK, though). */
+        for ( x = &d->arch.shadow.hash_table[i]; x; x = x->next )
+        {
+            if ( callback_mask & (1 << x->t) ) 
+            {
+                ASSERT(x->t <= 15);
+                ASSERT(callbacks[x->t] != NULL);
+                if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
+                    break;
+            }
+        }
+        if ( done ) break; 
+    }
+    d->arch.shadow.hash_walking = 0; 
+}
+
+
+/**************************************************************************/
+/* Destroy a shadow page: simple dispatcher to call the per-type destructor
+ * which will decrement refcounts appropriately and return memory to the 
+ * free pool. */
+
+void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    /* Shadow type is encoded in the page's count_info. */
+    u32 t = pg->count_info & PGC_SH_type_mask;
+
+
+    SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
+
+    /* Double-check, if we can, that the shadowed page belongs to this
+     * domain, (by following the back-pointer).  fl1 shadows and monitor
+     * tables carry no usable back-pointer, so they are exempted. */
+    ASSERT(t == PGC_SH_fl1_32_shadow  ||  
+           t == PGC_SH_fl1_pae_shadow ||  
+           t == PGC_SH_fl1_64_shadow  || 
+           t == PGC_SH_monitor_table  || 
+           (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) 
+            == v->domain)); 
+
+    /* The down-shifts here are so that the switch statement is on nice
+     * small numbers that the compiler will enjoy */
+    switch ( t >> PGC_SH_type_shift )
+    {
+#if CONFIG_PAGING_LEVELS == 2
+    case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
+    case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn); 
+        break;
+    case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
+        break;
+#else /* PAE or 64bit */
+    case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
+    case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
+        break;
+    case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
+        break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+    case PGC_SH_l1_pae_shadow >> PGC_SH_type_shift:
+    case PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
+        break;
+    case PGC_SH_l2_pae_shadow >> PGC_SH_type_shift:
+    case PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
+        break;
+    case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 3, 3)(v, smfn);
+        break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH_l1_64_shadow >> PGC_SH_type_shift:
+    case PGC_SH_fl1_64_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH_l2_64_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH_l3_64_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
+        SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
+        break;
+#endif
+    default:
+        SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n", 
+                       (unsigned long)t);
+        BUG();
+    }    
+}
+
+/**************************************************************************/
+/* Remove all writeable mappings of a guest frame from the shadow tables 
+ * Returns non-zero if we need to flush TLBs. 
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access.*/
+
+int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn, 
+                               unsigned int level,
+                               unsigned long fault_addr)
+{
+    /* Dispatch table for getting per-type functions.  Indexed by shadow
+     * type (PGC_SH_* >> PGC_SH_type_shift); only l1/fl1 shadows can hold
+     * writeable guest mappings, so only those slots are populated. */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32   */
+        SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32  */
+#else 
+        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32   */
+        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32  */
+#endif
+        NULL, /* l2_32   */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae  */
+        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */
+#else 
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#endif
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64   */
+        SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64  */
+#else
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#endif
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    static unsigned int callback_mask = 
+          1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
+        | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
+        | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
+        | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
+        | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
+        | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
+        ;
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(shadow_lock_is_acquired(v->domain));
+
+    /* Only remove writable mappings if we are doing shadow refcounts.
+     * In guest refcounting, we trust Xen to already be restricting
+     * all the writes to the guest page tables, so we do not need to
+     * do more. */
+    if ( !shadow_mode_refcounts(v->domain) )
+        return 0;
+
+    /* Early exit if it's already a pagetable, or otherwise not writeable */
+    if ( sh_mfn_is_a_page_table(gmfn) 
+         || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
+        return 0;
+
+    perfc_incrc(shadow_writeable);
+
+    /* If this isn't a "normal" writeable page, the domain is trying to 
+     * put pagetables in special memory of some kind.  We can't allow that. */
+    if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
+    {
+        SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %" 
+                      PRtype_info "\n",
+                      mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
+        domain_crash(v->domain);
+    }
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+    if ( v == current && level != 0 )
+    {
+        unsigned long gfn;
+        /* Heuristic: there is likely to be only one writeable mapping,
+         * and that mapping is likely to be in the current pagetable,
+         * either in the guest's linear map (linux, windows) or in a
+         * magic slot used to map high memory regions (linux HIGHTPTE) */
+
+        /* Try one guessed virtual address; if the writeable-mapping count
+         * has dropped to zero afterwards, we are done and need a flush. */
+#define GUESS(_a, _h) do {                                              \
+            if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) )      \
+                perfc_incrc(shadow_writeable_h_ ## _h);                \
+            if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )        \
+                return 1;                                               \
+        } while (0)
+
+
+        /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
+        if ( v == current 
+             && (gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
+            GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
+
+        if ( v->arch.shadow.mode->guest_levels == 2 )
+        {
+            if ( level == 1 )
+                /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
+                GUESS(0xC0000000UL + (fault_addr >> 10), 1);
+        }
+#if CONFIG_PAGING_LEVELS >= 3
+        else if ( v->arch.shadow.mode->guest_levels == 3 )
+        {
+            /* 32bit PAE w2k3: linear map at 0xC0000000 */
+            switch ( level ) 
+            {
+            case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
+            case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
+            }
+        }
+#if CONFIG_PAGING_LEVELS >= 4
+        else if ( v->arch.shadow.mode->guest_levels == 4 )
+        {
+            /* 64bit w2k3: linear map at 0x0000070000000000 */
+            switch ( level ) 
+            {
+            case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
+            case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
+            case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
+            }
+        }
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS >= 3 */
+
+#undef GUESS
+
+    }
+#endif
+    
+    /* Brute-force search of all the shadows, by walking the hash */
+    perfc_incrc(shadow_writeable_bf);
+    hash_foreach(v, callback_mask, callbacks, gmfn);
+
+    /* If that didn't catch the mapping, something is very wrong */
+    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
+    {
+        SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
+                      "%lu left\n", mfn_x(gmfn),
+                      (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
+        domain_crash(v->domain);
+    }
+    
+    /* We killed at least one writeable mapping, so must flush TLBs. */
+    return 1;
+}
+
+
+
+/**************************************************************************/
+/* Remove all mappings of a guest frame from the shadow tables.
+ * Returns non-zero if we need to flush TLBs. */
+
+int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+    int expected_count;
+
+    /* Dispatch table for getting per-type functions.  Indexed by shadow
+     * type; only l1/fl1 shadows can hold guest mappings, so only those
+     * slots are populated. */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32   */
+        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32  */
+#else 
+        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32   */
+        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32  */
+#endif
+        NULL, /* l2_32   */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae  */
+        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */
+#else 
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#endif
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64   */
+        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64  */
+#else
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#endif
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    static unsigned int callback_mask = 
+          1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
+        | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
+        | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
+        | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
+        | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
+        | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
+        ;
+
+    perfc_incrc(shadow_mappings);
+    if ( (page->count_info & PGC_count_mask) == 0 )
+        return 0;
+
+    ASSERT(shadow_lock_is_acquired(v->domain));
+
+    /* XXX TODO: 
+     * Heuristics for finding the (probably) single mapping of this gmfn */
+    
+    /* Brute-force search of all the shadows, by walking the hash */
+    perfc_incrc(shadow_mappings_bf);
+    hash_foreach(v, callback_mask, callbacks, gmfn);
+
+    /* If that didn't catch the mapping, something is very wrong */
+    /* A page that is still allocated keeps one general reference. */
+    expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
+    if ( (page->count_info & PGC_count_mask) != expected_count )
+    {
+        /* Don't complain if we're in HVM and there's one extra mapping: 
+         * The qemu helper process has an untyped mapping of this dom's RAM */
+        if ( !(shadow_mode_external(v->domain)
+               && (page->count_info & PGC_count_mask) <= 2
+               && (page->u.inuse.type_info & PGT_count_mask) == 0) )
+        {
+            SHADOW_ERROR("can't find all mappings of mfn %lx: "
+                          "c=%08x t=%08lx\n", mfn_x(gmfn), 
+                          page->count_info, page->u.inuse.type_info);
+        }
+    }
+
+    /* We killed at least one mapping, so must flush TLBs. */
+    return 1;
+}
+
+
+/**************************************************************************/
+/* Remove all shadows of a guest frame from the shadow tables */
+
+static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
+/* Follow this shadow's up-pointer, if it has one, and remove the reference
+ * found there.  Returns 1 if that was the only reference to this shadow */
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    mfn_t pmfn;
+    void *vaddr;
+    int rc;
+
+    /* Must be a real shadow, and not a top-level one (top-level shadows
+     * have no parent table, so no up-pointer to follow). */
+    ASSERT((pg->count_info & PGC_SH_type_mask) > 0);
+    ASSERT((pg->count_info & PGC_SH_type_mask) < PGC_SH_max_shadow);
+    ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l2_32_shadow);
+    ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l3_pae_shadow);
+    ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l4_64_shadow);
+    
+    if (pg->up == 0) return 0;
+    /* pg->up packs the parent shadow's MFN with the byte offset of the
+     * entry that points at us. */
+    pmfn = _mfn(pg->up >> PAGE_SHIFT);
+    ASSERT(valid_mfn(pmfn));
+    vaddr = sh_map_domain_page(pmfn);
+    ASSERT(vaddr);
+    vaddr += pg->up & (PAGE_SIZE-1);
+    ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
+    
+    /* Is this the only reference to this shadow? */
+    rc = ((pg->count_info & PGC_SH_count_mask) == 1) ? 1 : 0;
+
+    /* Blank the offending entry */
+    switch ((pg->count_info & PGC_SH_type_mask)) 
+    {
+    case PGC_SH_l1_32_shadow:
+    case PGC_SH_l2_32_shadow:
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
+#else
+        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
+#endif
+        break;
+#if CONFIG_PAGING_LEVELS >=3
+    case PGC_SH_l1_pae_shadow:
+    case PGC_SH_l2_pae_shadow:
+    case PGC_SH_l2h_pae_shadow:
+    case PGC_SH_l3_pae_shadow:
+        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
+        break;
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH_l1_64_shadow:
+    case PGC_SH_l2_64_shadow:
+    case PGC_SH_l3_64_shadow:
+    case PGC_SH_l4_64_shadow:
+        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
+        break;
+#endif
+#endif
+    default: BUG(); /* Some weird unknown shadow type */
+    }
+    
+    sh_unmap_domain_page(vaddr);
+    if ( rc )
+        perfc_incrc(shadow_up_pointer);
+    else
+        perfc_incrc(shadow_unshadow_bf);
+
+    return rc;
+}
+
+void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
+/* Remove the shadows of this guest page.  
+ * If all != 0, find all shadows, if necessary by walking the tables.
+ * Otherwise, just try the (much faster) heuristics, which will remove 
+ * at most one reference to each shadow of the page. */
+{
+    struct page_info *pg;
+    mfn_t smfn;
+    u32 sh_flags;
+    unsigned char t;
+
+    /* Dispatch table for getting per-type functions: each level must
+     * be called with the function to remove a lower-level shadow. */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+        NULL, /* l1_32   */
+        NULL, /* fl1_32  */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32   */
+#else 
+        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32   */
+#endif
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae  */
+        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
+        SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,3,3), /* l3_pae  */
+#else 
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#endif
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64   */
+        SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64   */
+        SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64   */
+#else
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+#endif
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    /* Another lookup table, for choosing which mask to use: for each
+     * shadow type, the set of parent shadow types that can point at it. */
+    static unsigned int masks[16] = {
+        0, /* none    */
+        1 << (PGC_SH_l2_32_shadow >> PGC_SH_type_shift), /* l1_32   */
+        0, /* fl1_32  */
+        0, /* l2_32   */
+        ((1 << (PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift))
+         | (1 << (PGC_SH_l2_pae_shadow >> PGC_SH_type_shift))), /* l1_pae  */
+        0, /* fl1_pae */
+        1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2_pae  */
+        1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2h_pae */
+        0, /* l3_pae  */
+        1 << (PGC_SH_l2_64_shadow >> PGC_SH_type_shift), /* l1_64   */
+        0, /* fl1_64  */
+        1 << (PGC_SH_l3_64_shadow >> PGC_SH_type_shift), /* l2_64   */
+        1 << (PGC_SH_l4_64_shadow >> PGC_SH_type_shift), /* l3_64   */
+        0, /* l4_64   */
+        0, /* p2m     */
+        0  /* unused  */
+    };
+
+    ASSERT(shadow_lock_is_acquired(v->domain));
+
+    pg = mfn_to_page(gmfn);
+
+    /* Bail out now if the page is not shadowed */
+    if ( (pg->count_info & PGC_page_table) == 0 )
+        return;
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+                   v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+
+    /* Search for this shadow in all appropriate shadows */
+    perfc_incrc(shadow_unshadow);
+    sh_flags = pg->shadow_flags;
+
+    /* Lower-level shadows need to be excised from upper-level shadows.
+     * This call to hash_foreach() looks dangerous but is in fact OK: each
+     * call will remove at most one shadow, and terminate immediately when
+     * it does remove it, so we never walk the hash after doing a deletion.  */
+#define DO_UNSHADOW(_type) do {                                 \
+    t = (_type) >> PGC_SH_type_shift;                          \
+    smfn = shadow_hash_lookup(v, mfn_x(gmfn), t);               \
+    if ( !sh_remove_shadow_via_pointer(v, smfn) && all )        \
+        hash_foreach(v, masks[t], callbacks, smfn);             \
+} while (0)
+
+    /* Top-level shadows need to be unpinned */
+#define DO_UNPIN(_type) do {                                             \
+    t = (_type) >> PGC_SH_type_shift;                                   \
+    smfn = shadow_hash_lookup(v, mfn_x(gmfn), t);                        \
+    if ( mfn_to_page(smfn)->count_info & PGC_SH_pinned )                 \
+        sh_unpin(v, smfn);                                               \
+    if ( (_type) == PGC_SH_l3_pae_shadow )                               \
+        SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn);   \
+} while (0)
+
+    if ( sh_flags & SHF_L1_32 )   DO_UNSHADOW(PGC_SH_l1_32_shadow);
+    if ( sh_flags & SHF_L2_32 )   DO_UNPIN(PGC_SH_l2_32_shadow);
+#if CONFIG_PAGING_LEVELS >= 3
+    if ( sh_flags & SHF_L1_PAE )  DO_UNSHADOW(PGC_SH_l1_pae_shadow);
+    if ( sh_flags & SHF_L2_PAE )  DO_UNSHADOW(PGC_SH_l2_pae_shadow);
+    if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(PGC_SH_l2h_pae_shadow);
+    if ( sh_flags & SHF_L3_PAE )  DO_UNPIN(PGC_SH_l3_pae_shadow);
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( sh_flags & SHF_L1_64 )   DO_UNSHADOW(PGC_SH_l1_64_shadow);
+    if ( sh_flags & SHF_L2_64 )   DO_UNSHADOW(PGC_SH_l2_64_shadow);
+    if ( sh_flags & SHF_L3_64 )   DO_UNSHADOW(PGC_SH_l3_64_shadow);
+    if ( sh_flags & SHF_L4_64 )   DO_UNPIN(PGC_SH_l4_64_shadow);
+#endif
+#endif
+
+#undef DO_UNSHADOW
+#undef DO_UNPIN
+
+
+#if CONFIG_PAGING_LEVELS > 2
+    /* We may have caused some PAE l3 entries to change: need to 
+     * fix up the copies of them in various places */
+    if ( sh_flags & (SHF_L2_PAE|SHF_L2H_PAE) )
+        sh_pae_recopy(v->domain);
+#endif
+
+    /* If that didn't catch the shadows, something is wrong */
+    if ( all && (pg->count_info & PGC_page_table) )
+    {
+        SHADOW_ERROR("can't find all shadows of mfn %05lx (shadow_flags=%08x)\n",
+                      mfn_x(gmfn), pg->shadow_flags);
+        domain_crash(v->domain);
+    }
+}
+
+void
+shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
+/* Even harsher: this is a HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+{
+    shadow_remove_all_shadows(v, gmfn);
+    /* XXX TODO:
+     * Rework this hashtable walker to return a linked-list of all 
+     * the shadows it modified, then do breadth-first recursion 
+     * to find the way up to higher-level tables and unshadow them too. 
+     *
+     * The current code (just tearing down each page's shadows as we
+     * detect that it is not a pagetable) is correct, but very slow. 
+     * It means extra emulated writes and slows down removal of mappings. */
+}
+
+/**************************************************************************/
+
+void sh_update_paging_modes(struct vcpu *v)
+/* Re-derive this vcpu's shadow paging mode from its current guest state
+ * (PV vs HVM, paging enabled, long mode, PAE), installing a new mode
+ * structure and, if the shadow level count changed, a new monitor table.
+ * Called with the shadow lock held. */
+{
+    struct domain *d = v->domain;
+    struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
+    mfn_t old_guest_table;
+
+    ASSERT(shadow_lock_is_acquired(d));
+
+    // Valid transitions handled by this function:
+    // - For PV guests:
+    //     - after a shadow mode has been changed
+    // - For HVM guests:
+    //     - after a shadow mode has been changed
+    //     - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
+    //
+
+    // Avoid determining the current shadow mode for uninitialized CPUs, as
+    // we can not yet determine whether it is an HVM or PV domain.
+    //
+    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+    {
+        printk("%s: postponing determination of shadow mode\n", __func__);
+        return;
+    }
+
+    // First, tear down any old shadow tables held by this vcpu.
+    //
+    shadow_detach_old_tables(v);
+
+    if ( !hvm_guest(v) )
+    {
+        ///
+        /// PV guest
+        ///
+#if CONFIG_PAGING_LEVELS == 4
+        if ( pv_32bit_guest(v) )
+            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3);
+        else
+            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
+#elif CONFIG_PAGING_LEVELS == 3
+        v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
+#elif CONFIG_PAGING_LEVELS == 2
+        v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
+#else
+#error unexpected paging mode
+#endif
+    }
+    else
+    {
+        ///
+        /// HVM guest
+        ///
+        ASSERT(shadow_mode_translate(d));
+        ASSERT(shadow_mode_external(d));
+
+        v->arch.shadow.hvm_paging_enabled = !!hvm_paging_enabled(v);
+        if ( !v->arch.shadow.hvm_paging_enabled )
+        {
+            
+            /* Set v->arch.guest_table to use the p2m map, and choose
+             * the appropriate shadow mode */
+            old_guest_table = pagetable_get_mfn(v->arch.guest_table);
+#if CONFIG_PAGING_LEVELS == 2
+            v->arch.guest_table =
+                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
+#elif CONFIG_PAGING_LEVELS == 3 
+            v->arch.guest_table =
+                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
+#else /* CONFIG_PAGING_LEVELS == 4 */
+            { 
+                l4_pgentry_t *l4e; 
+                /* Use the start of the first l3 table as a PAE l3 */
+                ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+                l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+                ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+                v->arch.guest_table =
+                    pagetable_from_pfn(l4e_get_pfn(l4e[0]));
+                sh_unmap_domain_page(l4e);
+            }
+            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
+#endif
+            /* Fix up refcounts on guest_table */
+            get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
+            if ( mfn_x(old_guest_table) != 0 )
+                put_page(mfn_to_page(old_guest_table));
+        }
+        else
+        {
+#ifdef __x86_64__
+            if ( hvm_long_mode_enabled(v) )
+            {
+                // long mode guest...
+                v->arch.shadow.mode =
+                    &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
+            }
+            else
+#endif
+                if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
+                {
+#if CONFIG_PAGING_LEVELS >= 3
+                    // 32-bit PAE mode guest...
+                    v->arch.shadow.mode =
+                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
+#else
+                    SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
+                    domain_crash(d);
+                    return;
+#endif
+                }
+                else
+                {
+                    // 32-bit 2 level guest...
+#if CONFIG_PAGING_LEVELS >= 3
+                    v->arch.shadow.mode =
+                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
+#else
+                    v->arch.shadow.mode =
+                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
+#endif
+                }
+        }
+
+        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+        {
+            mfn_t mmfn = shadow_make_monitor_table(v);
+            v->arch.monitor_table = pagetable_from_mfn(mmfn);
+            v->arch.monitor_vtable = sh_map_domain_page(mmfn);
+        } 
+
+        if ( v->arch.shadow.mode != old_mode )
+        {
+            SHADOW_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
+                          "(was g=%u s=%u)\n",
+                          d->domain_id, v->vcpu_id, 
+                          v->arch.shadow.mode->guest_levels,
+                          v->arch.shadow.mode->shadow_levels,
+                          old_mode ? old_mode->guest_levels : 0,
+                          old_mode ? old_mode->shadow_levels : 0);
+            if ( old_mode &&
+                 (v->arch.shadow.mode->shadow_levels !=
+                  old_mode->shadow_levels) )
+            {
+                /* Need to make a new monitor table for the new mode */
+                mfn_t new_mfn, old_mfn;
+
+                /* Only the vcpu itself may safely rebuild its own monitor
+                 * table, since it is about to switch CR3 onto it. */
+                if ( v != current ) 
+                {
+                    SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
+                                  "this HVM vcpu's (d=%u v=%u) paging mode!\n",
+                                  current->domain->domain_id, current->vcpu_id,
+                                  v->domain->domain_id, v->vcpu_id);
+                    domain_crash(v->domain);
+                    return;
+                }
+
+                sh_unmap_domain_page(v->arch.monitor_vtable);
+                old_mfn = pagetable_get_mfn(v->arch.monitor_table);
+                v->arch.monitor_table = pagetable_null();
+                new_mfn = v->arch.shadow.mode->make_monitor_table(v);            
+                v->arch.monitor_table = pagetable_from_mfn(new_mfn);
+                v->arch.monitor_vtable = sh_map_domain_page(new_mfn);
+                SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
+                               mfn_x(new_mfn));
+
+                /* Don't be running on the old monitor table when we 
+                 * pull it down!  Switch CR3, and warn the HVM code that
+                 * its host cr3 has changed. */
+                make_cr3(v, mfn_x(new_mfn));
+                write_ptbase(v);
+                hvm_update_host_cr3(v);
+                old_mode->destroy_monitor_table(v, old_mfn);
+            }
+        }
+
+        // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
+        //        These are HARD: think about the case where two CPU's have
+        //        different values for CR4.PSE and CR4.PGE at the same time.
+        //        This *does* happen, at least for CR4.PGE...
+    }
+
+    v->arch.shadow.mode->update_cr3(v);
+}
+
+/**************************************************************************/
+/* Turning on and off shadow features */
+
+static void sh_new_mode(struct domain *d, u32 new_mode)
+/* Inform all the vcpus that the shadow mode has been changed.
+ * Caller holds the shadow lock; the domain must not be the caller's own
+ * (its vcpus are paused while their paging modes are switched). */
+{
+    struct vcpu *v;
+
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(d != current->domain);
+    d->arch.shadow.mode = new_mode;
+    if ( new_mode & SHM2_translate ) 
+        shadow_audit_p2m(d);
+    for_each_vcpu(d, v)
+        sh_update_paging_modes(v);
+}
+
+static int shadow_enable(struct domain *d, u32 mode)
+/* Turn on "permanent" shadow features: external, translate, refcount.
+ * Can only be called once on a domain, and these features cannot be
+ * disabled.
+ * Returns 0 for success, -errno for failure. */
+{
+    unsigned int old_pages;
+    int rv = 0;
+
+    mode |= SHM2_enable;
+
+    domain_pause(d);
+    shadow_lock(d);
+
+    /* Sanity check the arguments */
+    if ( (d == current->domain) ||
+         shadow_mode_enabled(d) ||
+         ((mode & SHM2_external) && !(mode & SHM2_translate)) )
+    {
+        rv = -EINVAL;
+        goto out;
+    }
+
+    // XXX -- eventually would like to require that all memory be allocated
+    // *after* shadow_enabled() is called...  So here, we would test to make
+    // sure that d->page_list is empty.
+#if 0
+    spin_lock(&d->page_alloc_lock);
+    if ( !list_empty(&d->page_list) )
+    {
+        spin_unlock(&d->page_alloc_lock);
+        rv = -EINVAL;
+        goto out;
+    }
+    spin_unlock(&d->page_alloc_lock);
+#endif
+
+    /* Init the shadow memory allocation if the user hasn't done so */
+    old_pages = d->arch.shadow.total_pages;
+    if ( old_pages == 0 )
+        if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
+        {
+            set_sh_allocation(d, 0, NULL);
+            rv = -ENOMEM;
+            goto out;
+        }
+
+    /* Init the hash table */
+    if ( shadow_hash_alloc(d) != 0 )
+    {
+        set_sh_allocation(d, old_pages, NULL);
+        rv = -ENOMEM;
+        goto out;
+    }
+
+    /* Init the P2M table */
+    if ( mode & SHM2_translate )
+        if ( !shadow_alloc_p2m_table(d) )
+        {
+            shadow_hash_teardown(d);
+            set_sh_allocation(d, old_pages, NULL);
+            shadow_p2m_teardown(d);
+            rv = -ENOMEM;
+            goto out;
+        }
+
+    /* Update the bits */
+    sh_new_mode(d, mode);
+    shadow_audit_p2m(d);
+ out:
+    shadow_unlock(d);
+    domain_unpause(d);
+    /* Fix: propagate the collected status.  This previously returned an
+     * unconditional 0, so callers never saw -EINVAL/-ENOMEM failures. */
+    return rv;
+}
+
+void shadow_teardown(struct domain *d)
+/* Destroy the shadow pagetables of this domain and free its shadow memory.
+ * Should only be called for dying domains. */
+{
+    struct vcpu *v;
+    mfn_t mfn;
+
+    ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
+    ASSERT(d != current->domain);
+
+    /* May be entered with the lock already held by the caller; take it
+     * only if we don't have it yet. */
+    if ( !shadow_lock_is_acquired(d) )
+        shadow_lock(d); /* Keep various asserts happy */
+
+    if ( shadow_mode_enabled(d) )
+    {
+        /* Release the shadow and monitor tables held by each vcpu */
+        for_each_vcpu(d, v)
+        {
+            shadow_detach_old_tables(v);
+            if ( shadow_mode_external(d) )
+            {
+                /* External-mode vcpus own a monitor table: destroy it
+                 * before clearing the pointer. */
+                mfn = pagetable_get_mfn(v->arch.monitor_table);
+                if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
+                    shadow_destroy_monitor_table(v, mfn);
+                v->arch.monitor_table = pagetable_null();
+            }
+        }
+    }
+
+    if ( d->arch.shadow.total_pages != 0 )
+    {
+        SHADOW_PRINTK("teardown of domain %u starts."
+                       " Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow.total_pages,
+                       d->arch.shadow.free_pages,
+                       d->arch.shadow.p2m_pages);
+        /* Destroy all the shadows and release memory to domheap */
+        set_sh_allocation(d, 0, NULL);
+        /* Release the hash table back to xenheap */
+        if (d->arch.shadow.hash_table)
+            shadow_hash_teardown(d);
+        /* Release the log-dirty bitmap of dirtied pages */
+        sh_free_log_dirty_bitmap(d);
+        /* Should not have any more memory held */
+        SHADOW_PRINTK("teardown done."
+                       " Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->arch.shadow.total_pages,
+                       d->arch.shadow.free_pages,
+                       d->arch.shadow.p2m_pages);
+        ASSERT(d->arch.shadow.total_pages == 0);
+    }
+
+    /* We leave the "permanent" shadow modes enabled, but clear the
+     * log-dirty mode bit.  We don't want any more mark_dirty()
+     * calls now that we've torn down the bitmap */
+    d->arch.shadow.mode &= ~SHM2_log_dirty;
+
+    shadow_unlock(d);
+}
+
+/* Final stage of domain destruction, called from arch_domain_destroy()
+ * once it is safe to pull down the p2m map.  Also mops up any shadow
+ * allocation left behind by a domain that never went through
+ * domain_kill(). */
+void shadow_final_teardown(struct domain *d)
+{
+    SHADOW_PRINTK("dom %u final teardown starts."
+                   " Shadow pages total = %u, free = %u, p2m=%u\n",
+                   d->domain_id,
+                   d->arch.shadow.total_pages,
+                   d->arch.shadow.free_pages,
+                   d->arch.shadow.p2m_pages);
+
+    /* If any shadow memory survives, run the normal teardown first. */
+    if ( d->arch.shadow.total_pages != 0 )
+    {
+        shadow_teardown(d);
+    }
+
+    /* With the shadows gone, the p2m map can safely follow. */
+    if ( d->arch.shadow.p2m_pages != 0 )
+    {
+        shadow_p2m_teardown(d);
+    }
+
+    SHADOW_PRINTK("dom %u final teardown done."
+                   " Shadow pages total = %u, free = %u, p2m=%u\n",
+                   d->domain_id,
+                   d->arch.shadow.total_pages,
+                   d->arch.shadow.free_pages,
+                   d->arch.shadow.p2m_pages);
+}
+
+/* Turn on a single shadow mode feature bit.  Caller holds the shadow
+ * lock.  Returns 0 on success, -EINVAL for a bad call, -ENOMEM if the
+ * initial shadow allocation cannot be made. */
+static int shadow_one_bit_enable(struct domain *d, u32 mode)
+{
+    ASSERT(shadow_lock_is_acquired(d));
+
+    /* Refuse to operate on ourselves, or to set a bit twice. */
+    if ( d == current->domain )
+        return -EINVAL;
+    if ( d->arch.shadow.mode & mode )
+        return -EINVAL;
+
+    if ( d->arch.shadow.mode == 0 )
+    {
+        /* First feature bit: set up the shadow pool and hash table. */
+        if ( set_sh_allocation(d, 1, NULL) != 0
+             || shadow_hash_alloc(d) != 0 )
+        {
+            set_sh_allocation(d, 0, NULL);
+            return -ENOMEM;
+        }
+    }
+
+    /* Publish the new mode to all vcpus. */
+    sh_new_mode(d, d->arch.shadow.mode | mode);
+
+    return 0;
+}
+
+static int shadow_one_bit_disable(struct domain *d, u32 mode)
+/* Turn off a single shadow mode feature.  If this clears the last
+ * feature bit, also take the domain off shadows entirely and release
+ * all shadow memory.  Caller holds the shadow lock. */
+{
+    struct vcpu *v;
+    ASSERT(shadow_lock_is_acquired(d));
+
+    /* Sanity check the call: not on ourselves, and the bit must be set */
+    if ( d == current->domain || !(d->arch.shadow.mode & mode) )
+    {
+        return -EINVAL;
+    }
+
+    /* Update the bits */
+    sh_new_mode(d, d->arch.shadow.mode & ~mode);
+    if ( d->arch.shadow.mode == 0 )
+    {
+        /* Get this domain off shadows */
+        SHADOW_PRINTK("un-shadowing of domain %u starts."
+                       " Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow.total_pages,
+                       d->arch.shadow.free_pages,
+                       d->arch.shadow.p2m_pages);
+        for_each_vcpu(d, v)
+        {
+            shadow_detach_old_tables(v);
+            /* Point cr3 back at the guest's own pagetable (the
+             * user-mode table for 64-bit vcpus in user context). */
+#if CONFIG_PAGING_LEVELS == 4
+            if ( !(v->arch.flags & TF_kernel_mode) )
+                make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
+            else
+#endif
+                make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
+
+        }
+
+        /* Pull down the memory allocation */
+        if ( set_sh_allocation(d, 0, NULL) != 0 )
+        {
+            // XXX - How can this occur?
+            //       Seems like a bug to return an error now that we've
+            //       disabled the relevant shadow mode.
+            //
+            return -ENOMEM;
+        }
+        shadow_hash_teardown(d);
+        SHADOW_PRINTK("un-shadowing of domain %u done."
+                       " Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow.total_pages,
+                       d->arch.shadow.free_pages,
+                       d->arch.shadow.p2m_pages);
+    }
+
+    return 0;
+}
+
+/* Enable/disable ops for the "test" and "log-dirty" modes */
+/* Enable "test" shadow mode (plain shadowing, no extra features).
+ * Returns 0 on success, -EINVAL if the domain is already shadowed. */
+int shadow_test_enable(struct domain *d)
+{
+    int ret;
+
+    domain_pause(d);
+    shadow_lock(d);
+
+    if ( shadow_mode_enabled(d) )
+    {
+        /* Fix: the adjacent string literals used to concatenate to
+         * "...test modeon already..."; add the missing space. */
+        SHADOW_ERROR("Don't support enabling test mode"
+                     " on already shadowed doms\n");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = shadow_one_bit_enable(d, SHM2_enable);
+ out:
+    shadow_unlock(d);
+    domain_unpause(d);
+
+    return ret;
+}
+
+/* Disable "test" shadow mode; reverse of shadow_test_enable(). */
+int shadow_test_disable(struct domain *d)
+{
+    int rc;
+
+    domain_pause(d);
+    shadow_lock(d);
+    rc = shadow_one_bit_disable(d, SHM2_enable);
+    shadow_unlock(d);
+    domain_unpause(d);
+
+    return rc;
+}
+
+/* Allocate and zero the log-dirty bitmap, sized to cover the guest's
+ * pseudo-physical space rounded up to a whole number of longs.
+ * Returns 0 on success, -ENOMEM on failure (in which case the recorded
+ * bitmap size is reset to zero). */
+static int
+sh_alloc_log_dirty_bitmap(struct domain *d)
+{
+    ASSERT(d->arch.shadow.dirty_bitmap == NULL);
+
+    /* Round the bit count up to a multiple of BITS_PER_LONG. */
+    d->arch.shadow.dirty_bitmap_size =
+        (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
+        ~(BITS_PER_LONG - 1);
+    d->arch.shadow.dirty_bitmap =
+        xmalloc_array(unsigned long,
+                      d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
+    if ( d->arch.shadow.dirty_bitmap != NULL )
+    {
+        /* Start with every page clean. */
+        memset(d->arch.shadow.dirty_bitmap, 0,
+               d->arch.shadow.dirty_bitmap_size / 8);
+        return 0;
+    }
+
+    d->arch.shadow.dirty_bitmap_size = 0;
+    return -ENOMEM;
+}
+
+/* Release the log-dirty bitmap (if any) and record a zero size. */
+static void
+sh_free_log_dirty_bitmap(struct domain *d)
+{
+    d->arch.shadow.dirty_bitmap_size = 0;
+    if ( d->arch.shadow.dirty_bitmap == NULL )
+        return;
+    xfree(d->arch.shadow.dirty_bitmap);
+    d->arch.shadow.dirty_bitmap = NULL;
+}
+
+/* Enter log-dirty mode: allocate the dirty bitmap and set the mode bit.
+ * Returns 0 on success; -EINVAL if already log-dirty or already
+ * shadowed; -ENOMEM if the bitmap cannot be allocated. */
+static int shadow_log_dirty_enable(struct domain *d)
+{
+    int ret;
+
+    domain_pause(d);
+    shadow_lock(d);
+
+    if ( shadow_mode_log_dirty(d) )
+    {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if ( shadow_mode_enabled(d) )
+    {
+        /* Fix: the adjacent string literals used to concatenate to
+         * "...log-dirtyon already..."; add the missing space. */
+        SHADOW_ERROR("Don't (yet) support enabling log-dirty"
+                     " on already shadowed doms\n");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = sh_alloc_log_dirty_bitmap(d);
+    if ( ret != 0 )
+    {
+        sh_free_log_dirty_bitmap(d);
+        goto out;
+    }
+
+    ret = shadow_one_bit_enable(d, SHM2_log_dirty);
+    if ( ret != 0 )
+        sh_free_log_dirty_bitmap(d);
+
+ out:
+    shadow_unlock(d);
+    domain_unpause(d);
+    return ret;
+}
+
+/* Leave log-dirty mode, freeing the bitmap once the bit has cleared. */
+static int shadow_log_dirty_disable(struct domain *d)
+{
+    int rc;
+
+    domain_pause(d);
+    shadow_lock(d);
+    rc = shadow_one_bit_disable(d, SHM2_log_dirty);
+    if ( !shadow_mode_log_dirty(d) )
+        sh_free_log_dirty_bitmap(d);
+    shadow_unlock(d);
+    domain_unpause(d);
+
+    return rc;
+}
+
+/**************************************************************************/
+/* P2M map manipulations */
+
+static void
+sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+/* Drop the p2m entry for gfn, tearing down any shadows of the frame and
+ * any remaining mappings of it first.  No-op for non-translated domains. */
+{
+    struct vcpu *v;
+
+    if ( !shadow_mode_translate(d) )
+        return;
+
+    /* Shadow operations want a vcpu of the target domain; fall back to
+     * vcpu 0 when the caller runs in some other domain. */
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+
+
+    SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+    ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
+    //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
+
+    /* Destroy shadows of the frame, flush any surviving mappings, then
+     * invalidate both the p2m and m2p entries. */
+    shadow_remove_all_shadows_and_parents(v, _mfn(mfn));
+    if ( shadow_remove_all_mappings(v, _mfn(mfn)) )
+        flush_tlb_mask(d->domain_dirty_cpumask);
+    shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
+    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+}
+
+void
+shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+                                 unsigned long mfn)
+/* Public wrapper: remove a gfn->mfn translation under the shadow lock,
+ * auditing the p2m before and after. */
+{
+    shadow_lock(d);
+    shadow_audit_p2m(d);
+    sh_p2m_remove_page(d, gfn, mfn);
+    shadow_audit_p2m(d);
+    shadow_unlock(d);
+}
+
+void
+shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+                              unsigned long mfn)
+/* Install a gfn->mfn translation in a translated domain's p2m, first
+ * tearing down any previous mapping of either the gfn slot or the
+ * machine frame.  No-op for non-translated domains. */
+{
+    struct vcpu *v;
+    unsigned long ogfn;
+    mfn_t omfn;
+
+    if ( !shadow_mode_translate(d) )
+        return;
+
+    /* Shadow operations want a vcpu of the target domain; fall back to
+     * vcpu 0 when the caller runs in some other domain. */
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+
+    shadow_lock(d);
+    shadow_audit_p2m(d);
+
+    SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+    omfn = sh_gfn_to_mfn(d, gfn);
+    if ( valid_mfn(omfn) )
+    {
+        /* Get rid of the old mapping, especially any shadows */
+        shadow_remove_all_shadows_and_parents(v, omfn);
+        if ( shadow_remove_all_mappings(v, omfn) )
+            flush_tlb_mask(d->domain_dirty_cpumask);
+        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+    }
+
+    ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
+    /* NOTE(review): 0x55..55 appears to be a debug poison value for
+     * uninitialised m2p entries -- confirm against the m2p setup code. */
+    if (
+#ifdef __x86_64__
+        (ogfn != 0x5555555555555555L)
+#else
+        (ogfn != 0x55555555L)
+#endif
+        && (ogfn != INVALID_M2P_ENTRY)
+        && (ogfn != gfn) )
+    {
+        /* This machine frame is already mapped at another physical address */
+        SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
+                       mfn, ogfn, gfn);
+        if ( valid_mfn(omfn = sh_gfn_to_mfn(d, ogfn)) )
+        {
+            SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
+                           ogfn , mfn_x(omfn));
+            /* Only unmap the old slot if it still points at this frame. */
+            if ( mfn_x(omfn) == mfn )
+                sh_p2m_remove_page(d, ogfn, mfn);
+        }
+    }
+
+    shadow_set_p2m_entry(d, gfn, _mfn(mfn));
+    set_gpfn_from_mfn(mfn, gfn);
+    shadow_audit_p2m(d);
+    shadow_unlock(d);
+}
+
+/**************************************************************************/
+/* Log-dirty mode support */
+
+/* Convert a shadow to log-dirty mode. */
+void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
+{
+    /* Not implemented: callers are expected never to find a
+     * non-log-dirty shadow while the domain is in log-dirty mode, so
+     * reaching this function is a fatal logic error. */
+    BUG();
+}
+
+
+/* Read a domain's log-dirty bitmap and stats.
+ * If the operation is a CLEAN, clear the bitmap and stats as well.
+ * Returns 0 for success, -EINVAL for bad arguments or a failed copy
+ * out to the guest. */
+static int shadow_log_dirty_op(
+    struct domain *d, struct xen_domctl_shadow_op *sc)
+{
+    int i, rv = 0, clean = 0;
+
+    domain_pause(d);
+    shadow_lock(d);
+
+    clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
+
+    SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
+                  (clean) ? "clean" : "peek",
+                  d->domain_id,
+                  d->arch.shadow.fault_count,
+                  d->arch.shadow.dirty_count);
+
+    sc->stats.fault_count = d->arch.shadow.fault_count;
+    sc->stats.dirty_count = d->arch.shadow.dirty_count;
+
+    if ( clean )
+    {
+        struct list_head *l, *t;
+        struct page_info *pg;
+
+        /* Need to revoke write access to the domain's pages again.
+         * In future, we'll have a less heavy-handed approach to this,
+         * but for now, we just unshadow everything except Xen. */
+        list_for_each_safe(l, t, &d->arch.shadow.toplevel_shadows)
+        {
+            pg = list_entry(l, struct page_info, list);
+            shadow_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
+        }
+
+        d->arch.shadow.fault_count = 0;
+        d->arch.shadow.dirty_count = 0;
+    }
+
+    if ( guest_handle_is_null(sc->dirty_bitmap) ||
+         (d->arch.shadow.dirty_bitmap == NULL) )
+    {
+        rv = -EINVAL;
+        goto out;
+    }
+
+    if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
+        sc->pages = d->arch.shadow.dirty_bitmap_size;
+
+#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
+    for ( i = 0; i < sc->pages; i += CHUNK )
+    {
+        int bytes = ((((sc->pages - i) > CHUNK)
+                      ? CHUNK
+                      : (sc->pages - i)) + 7) / 8;
+
+        if ( copy_to_guest_offset(
+                 sc->dirty_bitmap,
+                 i/(8*sizeof(unsigned long)),
+                 d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
+        {
+            rv = -EINVAL;
+            goto out;
+        }
+
+        if ( clean )
+            memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                   0, bytes);
+    }
+#undef CHUNK
+
+ out:
+    shadow_unlock(d);
+    domain_unpause(d);
+    /* Fix: return the collected status.  This previously returned an
+     * unconditional 0, so the -EINVAL paths above were invisible to the
+     * caller. */
+    return rv;
+}
+
+
+/* Mark a page as dirty in the log-dirty bitmap.  Caller holds the
+ * shadow lock, and the domain is in log-dirty mode. */
+void sh_do_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+    unsigned long pfn;
+
+    ASSERT(shadow_lock_is_acquired(d));
+    ASSERT(shadow_mode_log_dirty(d));
+
+    if ( !valid_mfn(gmfn) )
+        return;
+
+    ASSERT(d->arch.shadow.dirty_bitmap != NULL);
+
+    /* We /really/ mean PFN here, even for non-translated guests. */
+    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+    /*
+     * Values with the MSB set denote MFNs that aren't really part of the
+     * domain's pseudo-physical memory map (e.g., the shared info frame).
+     * Nothing to do here...
+     */
+    if ( unlikely(!VALID_M2P(pfn)) )
+        return;
+
+    /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
+    if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) )
+    {
+        if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
+        {
+            /* First time this pfn was dirtied since the last clean. */
+            SHADOW_DEBUG(LOGDIRTY,
+                          "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
+                          mfn_x(gmfn), pfn, d->domain_id);
+            d->arch.shadow.dirty_count++;
+        }
+    }
+    else
+    {
+        /* pfn out of bitmap range: log the offending page's details. */
+        SHADOW_PRINTK("mark_dirty OOR! "
+                       "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
+                       "owner=%d c=%08x t=%" PRtype_info "\n",
+                       mfn_x(gmfn),
+                       pfn,
+                       d->arch.shadow.dirty_bitmap_size,
+                       d->domain_id,
+                       (page_get_owner(mfn_to_page(gmfn))
+                        ? page_get_owner(mfn_to_page(gmfn))->domain_id
+                        : -1),
+                       mfn_to_page(gmfn)->count_info,
+                       mfn_to_page(gmfn)->u.inuse.type_info);
+    }
+}
+
+
+/**************************************************************************/
+/* Shadow-control XEN_DOMCTL dispatcher */
+
+int shadow_domctl(struct domain *d,
+                  xen_domctl_shadow_op_t *sc,
+                  XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
+/* Dispatch a XEN_DOMCTL shadow op to the matching handler.
+ * Returns 0 or -errno; SET_ALLOCATION may instead return a hypercall
+ * continuation if it is preempted part-way through. */
+{
+    int rc, preempted = 0;
+
+    /* Shadow ops pause the target domain, so self-targeting would
+     * deadlock. */
+    if ( unlikely(d == current->domain) )
+    {
+        DPRINTK("Don't try to do a shadow op on yourself!\n");
+        return -EINVAL;
+    }
+
+    switch ( sc->op )
+    {
+    case XEN_DOMCTL_SHADOW_OP_OFF:
+        /* Peel off log-dirty first, then the base enable bit. */
+        if ( shadow_mode_log_dirty(d) )
+            if ( (rc = shadow_log_dirty_disable(d)) != 0 )
+                return rc;
+        if ( d->arch.shadow.mode & SHM2_enable )
+            if ( (rc = shadow_test_disable(d)) != 0 )
+                return rc;
+        return 0;
+
+    case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
+        return shadow_test_enable(d);
+
+    case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
+        return shadow_log_dirty_enable(d);
+
+    case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
+        return shadow_enable(d, SHM2_refcounts|SHM2_translate);
+
+    case XEN_DOMCTL_SHADOW_OP_CLEAN:
+    case XEN_DOMCTL_SHADOW_OP_PEEK:
+        return shadow_log_dirty_op(d, sc);
+
+    case XEN_DOMCTL_SHADOW_OP_ENABLE:
+        if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
+            return shadow_log_dirty_enable(d);
+        /* Caller-supplied mode bits share layout with SHM2_*, shifted. */
+        return shadow_enable(d, sc->mode << SHM2_shift);
+
+    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+        sc->mb = shadow_get_allocation(d);
+        return 0;
+
+    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
+        rc = shadow_set_allocation(d, sc->mb, &preempted);
+        if ( preempted )
+            /* Not finished.  Set up to re-run the call. */
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_domctl, "h", u_domctl);
+        else
+            /* Finished.  Return the new allocation */
+            sc->mb = shadow_get_allocation(d);
+        return rc;
+
+    default:
+        SHADOW_ERROR("Bad shadow op %u\n", sc->op);
+        return -EINVAL;
+    }
+}
+
+
+/**************************************************************************/
+/* Auditing shadow tables */
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
+
+void shadow_audit_tables(struct vcpu *v)
+/* Audit the shadow pagetables: either every shadow in the system, or
+ * (when only partial auditing is configured) just the current paging
+ * mode's shadows. */
+{
+    /* Dispatch table for getting per-type functions */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2),  /* l1_32   */
+        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32  */
+        SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2),  /* l2_32   */
+#else
+        SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2),  /* l1_32   */
+        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32  */
+        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2),  /* l2_32   */
+        SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3),  /* l1_pae  */
+        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
+        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3),  /* l2_pae  */
+        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3),  /* l2h_pae */
+        SHADOW_INTERNAL_NAME(sh_audit_l3_table,3,3),  /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4),  /* l1_64   */
+        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64  */
+        SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4),  /* l2_64   */
+        SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4),  /* l3_64   */
+        SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4),  /* l4_64   */
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS > 2 */
+        NULL  /* All the rest */
+    };
+    unsigned int mask;
+
+    if ( !(SHADOW_AUDIT_ENABLE) )
+        return;
+
+    if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
+        mask = ~1; /* Audit every table in the system */
+    else
+    {
+        /* Audit only the current mode's tables */
+        switch ( v->arch.shadow.mode->guest_levels )
+        {
+        case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
+        case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
+                        |SHF_L2H_PAE|SHF_L3_PAE); break;
+        case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
+                        |SHF_L3_64|SHF_L4_64); break;
+        default: BUG();
+        }
+    }
+
+    /* Fix: pass the mask we just computed.  It was previously ignored
+     * in favour of a hard-coded ~1, which made the mode-restricted
+     * branch above dead code (and left 'mask' set-but-unused). */
+    hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
+}
+
+#endif /* Shadow audit */
+
+
+/**************************************************************************/
+/* Auditing p2m tables */
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_P2M
+
+void shadow_audit_p2m(struct domain *d)
+/* Cross-check a translated domain's p2m table against the m2p: first
+ * walk the page list checking each frame's m2p entry round-trips
+ * through the p2m, then walk the p2m checking each present entry
+ * round-trips through the m2p. */
+{
+    struct list_head *entry;
+    struct page_info *page;
+    struct domain *od;
+    unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+    mfn_t p2mfn;
+    unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
+    int test_linear;
+
+    if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
+        return;
+
+    //SHADOW_PRINTK("p2m audit starts\n");
+
+    /* Only check the linear p2m map when it's ours and mapped. */
+    test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
+    if ( test_linear )
+        local_flush_tlb();
+
+    /* Audit part one: walk the domain's page allocation list, checking
+     * the m2p entries. */
+    for ( entry = d->page_list.next;
+          entry != &d->page_list;
+          entry = entry->next )
+    {
+        page = list_entry(entry, struct page_info, list);
+        mfn = mfn_x(page_to_mfn(page));
+
+        // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
+
+        od = page_get_owner(page);
+
+        if ( od != d )
+        {
+            SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+                           mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+            continue;
+        }
+
+        gfn = get_gpfn_from_mfn(mfn);
+        if ( gfn == INVALID_M2P_ENTRY )
+        {
+            orphans_i++;
+            //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+            //               mfn);
+            continue;
+        }
+
+        if ( gfn == 0x55555555 )
+        {
+            orphans_d++;
+            //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
+            //               mfn);
+            continue;
+        }
+
+        p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
+        if ( mfn_x(p2mfn) != mfn )
+        {
+            mpbad++;
+            SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+                           " (-> gfn %#lx)\n",
+                           mfn, gfn, mfn_x(p2mfn),
+                           (mfn_valid(p2mfn)
+                            ? get_gpfn_from_mfn(mfn_x(p2mfn))
+                            : -1u));
+            /* This m2p entry is stale: the domain has another frame in
+             * this physical slot.  No great disaster, but for neatness,
+             * blow away the m2p entry. */
+            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+        }
+
+        if ( test_linear )
+        {
+            lp2mfn = get_mfn_from_gpfn(gfn);
+            if ( lp2mfn != mfn_x(p2mfn) )
+            {
+                /* Fix: unwrap p2mfn with mfn_x() -- passing an mfn_t
+                 * to a %#lx conversion is a format/argument mismatch
+                 * when mfn_t is a typedef'd struct. */
+                SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+                               "(!= mfn %#lx)\n", gfn, lp2mfn,
+                               mfn_x(p2mfn));
+            }
+        }
+
+        // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
+        //                mfn, gfn, p2mfn, lp2mfn);
+    }
+
+    /* Audit part two: walk the domain's p2m table, checking the entries. */
+    if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
+    {
+        l2_pgentry_t *l2e;
+        l1_pgentry_t *l1e;
+        int i1, i2;
+
+#if CONFIG_PAGING_LEVELS == 4
+        l4_pgentry_t *l4e;
+        l3_pgentry_t *l3e;
+        int i3, i4;
+        l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#elif CONFIG_PAGING_LEVELS == 3
+        l3_pgentry_t *l3e;
+        int i3;
+        l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#else /* CONFIG_PAGING_LEVELS == 2 */
+        l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#endif
+
+        gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 3
+#if CONFIG_PAGING_LEVELS >= 4
+        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+        {
+            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+            {
+                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+                continue;
+            }
+            l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
+#endif /* now at levels 3 or 4... */
+            for ( i3 = 0;
+                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+                  i3++ )
+            {
+                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+                {
+                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+                    continue;
+                }
+                l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
+#endif /* all levels... */
+                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+                {
+                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+                    {
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+                    l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
+
+                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+                    {
+                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+                            continue;
+                        mfn = l1e_get_pfn(l1e[i1]);
+                        ASSERT(valid_mfn(_mfn(mfn)));
+                        m2pfn = get_gpfn_from_mfn(mfn);
+                        if ( m2pfn != gfn )
+                        {
+                            pmbad++;
+                            SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn, mfn, m2pfn);
+                            BUG();
+                        }
+                    }
+                    sh_unmap_domain_page(l1e);
+                }
+#if CONFIG_PAGING_LEVELS >= 3
+                sh_unmap_domain_page(l2e);
+            }
+#if CONFIG_PAGING_LEVELS >= 4
+            sh_unmap_domain_page(l3e);
+        }
+#endif
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+        sh_unmap_domain_page(l4e);
+#elif CONFIG_PAGING_LEVELS == 3
+        sh_unmap_domain_page(l3e);
+#else /* CONFIG_PAGING_LEVELS == 2 */
+        sh_unmap_domain_page(l2e);
+#endif
+
+    }
+
+    //SHADOW_PRINTK("p2m audit complete\n");
+    //if ( orphans_i | orphans_d | mpbad | pmbad )
+    //    SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+    //                   orphans_i + orphans_d, orphans_i, orphans_d,
+    if ( mpbad | pmbad )
+        SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+                       pmbad, mpbad);
+}
+
+#endif /* p2m audit */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * arch/x86/mm/shadow/multi.c
+ *
+ * Simple, mostly-synchronous shadow page tables.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+// DESIGN QUESTIONS:
+// Why use subshadows for PAE guests?
+// - reduces pressure in the hash table
+// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
+// - would need to find space in the page_info to store 7 more bits of
+// backpointer
+// - independent shadows of 32 byte chunks makes it non-obvious how to quickly
+// figure out when to demote the guest page from l3 status
+//
+// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
+// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
+// space for both PV and HVM guests.
+//
+
+#define SHADOW 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/domain_page.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/shadow.h>
+#include <asm/flushtlb.h>
+#include <asm/hvm/hvm.h>
+#include "private.h"
+#include "types.h"
+
+/* The first cut: an absolutely synchronous, trap-and-emulate version,
+ * supporting only HVM guests (and so only "external" shadow mode).
+ *
+ * THINGS TO DO LATER:
+ *
+ * FIX GVA_TO_GPA
+ * The current interface returns an unsigned long, which is not big enough
+ * to hold a physical address in PAE. Should return a gfn instead.
+ *
+ * TEARDOWN HEURISTICS
+ * Also: have a heuristic for when to destroy a previous paging-mode's
+ * shadows. When a guest is done with its start-of-day 32-bit tables
+ * and reuses the memory we want to drop those shadows. Start with
+ * shadows in a page in two modes as a hint, but beware of clever tricks
+ * like reusing a pagetable for both PAE and 64-bit during boot...
+ *
+ * PAE LINEAR MAPS
+ * Rework shadow_get_l*e() to have the option of using map_domain_page()
+ * instead of linear maps. Add appropriate unmap_l*e calls in the users.
+ * Then we can test the speed difference made by linear maps. If the
+ * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
+ * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
+ * to share l2h pages again.
+ *
+ * PAE L3 COPYING
+ * In this code, we copy all 32 bytes of a PAE L3 every time we change an
+ * entry in it, and every time we change CR3. We copy it for the linear
+ * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
+ * buffer so it fits in CR3. Maybe we can avoid some of this recopying
+ * by using the shadow directly in some places.
+ * Also, for SMP, need to actually respond to seeing shadow.pae_flip_pending.
+ *
+ * GUEST_WALK_TABLES TLB FLUSH COALESCE
+ * guest_walk_tables can do up to three remote TLB flushes as it walks to
+ * the first l1 of a new pagetable. Should coalesce the flushes to the end,
+ * and if we do flush, re-do the walk. If anything has changed, then
+ * pause all the other vcpus and do the walk *again*.
+ *
+ * WP DISABLED
+ * Consider how to implement having the WP bit of CR0 set to 0.
+ * Since we need to be able to cause write faults to pagetables, this might
+ * end up looking like not having the (guest) pagetables present at all in
+ * HVM guests...
+ *
+ * PSE disabled / PSE36
+ * We don't support any modes other than PSE enabled, PSE36 disabled.
+ * Neither of those would be hard to change, but we'd need to be able to
+ * deal with shadows made in one mode and used in another.
+ */
+
+/* Why is a guest entry being fetched into the shadows?  These are used
+ * as a bitmask: a demand write is DEMAND|WRITE. */
+#define FETCH_TYPE_PREFETCH 1
+#define FETCH_TYPE_DEMAND 2
+#define FETCH_TYPE_WRITE 4
+typedef enum {
+    ft_prefetch = FETCH_TYPE_PREFETCH,  /* speculative fill */
+    ft_demand_read = FETCH_TYPE_DEMAND, /* fault on read */
+    ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE, /* fault on write */
+} fetch_type_t;
+
+#ifdef DEBUG_TRACE_DUMP
+/* Human-readable names for fetch_type_t values, for trace output.
+ * Fix: use C99 designated-initializer syntax ("[idx] = val"); the old
+ * GNU "[idx] val" form is obsolete and rejected by modern compilers. */
+static char *fetch_type_names[] = {
+    [ft_prefetch]     = "prefetch",
+    [ft_demand_read]  = "demand read",
+    [ft_demand_write] = "demand write",
+};
+#endif
+
+/* XXX forward declarations */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res);
+#endif
+static inline void sh_update_linear_entries(struct vcpu *v);
+
+/**************************************************************************/
+/* Hash table mapping from guest pagetables to shadows
+ *
+ * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
+ * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
+ * shadow L1 which maps its "splinters".
+ * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
+ * PAE L3 info page for that CR3 value.
+ */
+
+/* Look up the FL1 shadow for a superpage gfn.  If the domain is in
+ * log-dirty mode, lazily convert any shadow found that has not yet
+ * been marked log-dirty. */
+static inline mfn_t
+get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
+{
+    mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn),
+                                     PGC_SH_fl1_shadow >> PGC_SH_type_shift);
+
+    if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+    {
+        struct page_info *sp = mfn_to_page(smfn);
+        if ( !(sp->count_info & PGC_SH_log_dirty) )
+            shadow_convert_to_log_dirty(v, smfn);
+    }
+
+    return smfn;
+}
+
+/* Look up the shadow of the given type for guest frame gmfn, with the
+ * same lazy log-dirty conversion as get_fl1_shadow_status(). */
+static inline mfn_t
+get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+{
+    mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn),
+                                    shadow_type >> PGC_SH_type_shift);
+
+    perfc_incrc(shadow_get_shadow_status);
+
+    if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+    {
+        struct page_info *sp = mfn_to_page(smfn);
+        if ( !(sp->count_info & PGC_SH_log_dirty) )
+            shadow_convert_to_log_dirty(v, smfn);
+    }
+
+    return smfn;
+}
+
+/* Insert an FL1 shadow (keyed by guest superpage gfn) into the hash,
+ * tagging it with the log-dirty bit iff the domain is in log-dirty
+ * mode. */
+static inline void
+set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+{
+    struct page_info *sp = mfn_to_page(smfn);
+
+    SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
+                  gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn));
+
+    if ( unlikely(shadow_mode_log_dirty(v->domain)) )
+    {
+        /* Mark this shadow as a log-dirty shadow. */
+        set_bit(_PGC_SH_log_dirty, &sp->count_info);
+    }
+    else
+    {
+        clear_bit(_PGC_SH_log_dirty, &sp->count_info);
+    }
+
+    shadow_hash_insert(v, gfn_x(gfn),
+                       PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn);
+}
+
+static inline void
+set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+/* Put a shadow into the hash table */
+{
+    struct domain *d = v->domain;
+    int res;
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
+                  d->domain_id, v->vcpu_id, mfn_x(gmfn),
+                  shadow_type, mfn_x(smfn));
+
+    if ( unlikely(shadow_mode_log_dirty(d)) )
+        // mark this shadow as a log dirty shadow...
+        set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
+    else
+        clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
+
+    /* Hold a reference on the guest page while it has a shadow; the
+     * matching put_page() is in delete_shadow_status(). */
+    res = get_page(mfn_to_page(gmfn), d);
+    ASSERT(res == 1);
+
+    shadow_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH_type_shift,
+                       smfn);
+}
+
+/* Drop the hash entry of an FL1 shadow (keyed by superpage gfn). */
+static inline void
+delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+{
+    u32 t = PGC_SH_fl1_shadow >> PGC_SH_type_shift;
+
+    SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
+                  gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn));
+
+    shadow_hash_delete(v, gfn_x(gfn), t, smfn);
+}
+
+/* Drop a shadow's hash entry and release the reference that
+ * set_shadow_status() took on its guest page. */
+static inline void
+delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+{
+    u32 t = shadow_type >> PGC_SH_type_shift;
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
+                  v->domain->domain_id, v->vcpu_id,
+                  mfn_x(gmfn), shadow_type, mfn_x(smfn));
+
+    shadow_hash_delete(v, mfn_x(gmfn), t, smfn);
+    put_page(mfn_to_page(gmfn));
+}
+
+/**************************************************************************/
+/* CPU feature support querying */
+
+static inline int
+guest_supports_superpages(struct vcpu *v)
+/* Returns non-zero iff _PAGE_PSE must be honoured for this guest:
+ * only HVM guests, and in 2-level mode only when CR4.PSE is set
+ * (PAE/long mode always support superpages). */
+{
+    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
+     * CR4.PSE is set or the guest is in PAE or long mode */
+    return (hvm_guest(v) && (GUEST_PAGING_LEVELS != 2
+                             || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
+}
+
+static inline int
+guest_supports_nx(struct vcpu *v)
+/* Returns non-zero iff this guest may set the NX bit in its pagetables.
+ * PV guests follow the host CPU; HVM guests are unconditionally allowed
+ * for now (see the XXX below — should check the guest's EFER.NXE). */
+{
+    if ( !hvm_guest(v) )
+        return cpu_has_nx;
+
+    // XXX - fix this!
+    return 1;
+}
+
+
+/**************************************************************************/
+/* Functions for walking the guest page tables */
+
+
+/* Walk the guest pagetables, filling the walk_t with what we see.
+ * Takes an uninitialised walk_t. The caller must call unmap_walk()
+ * on the walk_t before discarding it or calling guest_walk_tables again.
+ * If "guest_op" is non-zero, we are serving a genuine guest memory access,
+ * and must (a) be under the shadow lock, and (b) remove write access
+ * from any guest PT pages we see, as we will be using their contents to
+ * perform shadow updates.
+ * Returns 0 for success or non-zero if the guest pagetables are malformed.
+ * N.B. Finding a not-present entry does not cause a non-zero return code. */
+static inline int
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
+/* See the comment block above for the contract: fills *gw, which the
+ * caller must dispose of with unmap_walk(); returns 0 on success,
+ * non-zero for malformed guest tables (a gfn with no backing mfn).
+ * A not-present entry at any level returns 0 with the lower levels of
+ * *gw left zeroed by the memset below. */
+{
+    ASSERT(!guest_op || shadow_lock_is_acquired(v->domain));
+
+    perfc_incrc(shadow_guest_walk);
+    memset(gw, 0, sizeof(*gw));
+    gw->va = va;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    /* Get l4e from the top level table */
+    gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
+    gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
+    /* Walk down to the l3e */
+    if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
+    gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
+    if ( !valid_mfn(gw->l3mfn) ) return 1;
+    /* This mfn is a pagetable: make sure the guest can't write to it. */
+    if ( guest_op && shadow_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
+        flush_tlb_mask(v->domain->domain_dirty_cpumask);
+    gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
+        + guest_l3_table_offset(va);
+#else /* PAE only... */
+    /* Get l3e from the top level table */
+    gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
+    gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
+#endif /* PAE or 64... */
+    /* Walk down to the l2e */
+    if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
+    gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
+    if ( !valid_mfn(gw->l2mfn) ) return 1;
+    /* This mfn is a pagetable: make sure the guest can't write to it. */
+    if ( guest_op && shadow_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
+        flush_tlb_mask(v->domain->domain_dirty_cpumask);
+    gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
+        + guest_l2_table_offset(va);
+#else /* 32-bit only... */
+    /* Get l2e from the top level table */
+    gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
+    gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
+#endif /* All levels... */
+
+    if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
+    if ( guest_supports_superpages(v) &&
+         (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
+    {
+        /* Special case: this guest VA is in a PSE superpage, so there's
+         * no guest l1e.  We make one up so that the propagation code
+         * can generate a shadow l1 table.  Start with the gfn of the
+         * first 4k-page of the superpage. */
+        gfn_t start = guest_l2e_get_gfn(*gw->l2e);
+        /* Grant full access in the l1e, since all the guest entry's
+         * access controls are enforced in the shadow l2e.  This lets
+         * us reflect l2 changes later without touching the l1s. */
+        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+                     _PAGE_ACCESSED|_PAGE_DIRTY);
+        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
+         * of the level 1 */
+        if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
+            flags |= _PAGE_PAT;
+        /* Increment the pfn by the right number of 4k pages.
+         * The ~0x1 is to mask out the PAT bit mentioned above. */
+        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
+        /* l1e == NULL + valid eff_l1e is the walk_t signature of a PSE
+         * mapping: consumers must use eff_l1e, not *l1e. */
+        gw->eff_l1e = guest_l1e_from_gfn(start, flags);
+        gw->l1e = NULL;
+        gw->l1mfn = _mfn(INVALID_MFN);
+    }
+    else
+    {
+        /* Not a superpage: carry on and find the l1e. */
+        gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
+        if ( !valid_mfn(gw->l1mfn) ) return 1;
+        /* This mfn is a pagetable: make sure the guest can't write to it. */
+        if ( guest_op
+             && shadow_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
+            flush_tlb_mask(v->domain->domain_dirty_cpumask);
+        gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
+            + guest_l1_table_offset(va);
+        gw->eff_l1e = *gw->l1e;
+    }
+
+    return 0;
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number. */
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number.  Returns INVALID_GFN if the effective l1e
+ * is not present. */
+static inline gfn_t
+guest_walk_to_gfn(walk_t *gw)
+{
+    if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
+        return _gfn(INVALID_GFN);
+    return guest_l1e_get_gfn(gw->eff_l1e);
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address. */
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address.  Returns 0 if the effective l1e is
+ * not present (callers cannot distinguish this from a real gpa of 0). */
+static inline paddr_t
+guest_walk_to_gpa(walk_t *gw)
+{
+    if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
+        return 0;
+    return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
+}
+
+
+/* Unmap (and reinitialise) a guest walk.
+ * Call this to dispose of any walk filled in by guest_walk_tables() */
+/* Unmap (and reinitialise) a guest walk.
+ * Call this to dispose of any walk filled in by guest_walk_tables().
+ * Only the levels that guest_walk_tables() mapped with
+ * sh_map_domain_page() are unmapped; top-level pointers came from
+ * guest_vtable and need no unmap. */
+static void unmap_walk(struct vcpu *v, walk_t *gw)
+{
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+    if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
+#endif
+    if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
+#endif
+    if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
+#ifdef DEBUG
+    /* Poison the walk in debug builds so stale use is caught early. */
+    memset(gw, 0, sizeof(*gw));
+#endif
+}
+
+
+/* Pretty-print the contents of a guest-walk */
+/* Pretty-print the contents of a guest-walk for debugging.
+ * Dereferences each level's entry pointer only if it is mapped. */
+static inline void print_gw(walk_t *gw)
+{
+    SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    SHADOW_PRINTK("   l4mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l4mfn));
+    SHADOW_PRINTK("   l4e=%p\n", gw->l4e);
+    if ( gw->l4e )
+        SHADOW_PRINTK("   *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
+#endif /* PAE or 64... */
+    SHADOW_PRINTK("   l3mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l3mfn));
+    SHADOW_PRINTK("   l3e=%p\n", gw->l3e);
+    if ( gw->l3e )
+        SHADOW_PRINTK("   *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
+#endif /* All levels... */
+    SHADOW_PRINTK("   l2mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l2mfn));
+    SHADOW_PRINTK("   l2e=%p\n", gw->l2e);
+    if ( gw->l2e )
+        SHADOW_PRINTK("   *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
+    SHADOW_PRINTK("   l1mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l1mfn));
+    SHADOW_PRINTK("   l1e=%p\n", gw->l1e);
+    if ( gw->l1e )
+        SHADOW_PRINTK("   *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
+    SHADOW_PRINTK("   eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
+}
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+/* Lightweight audit: pass all the shadows associated with this guest walk
+ * through the audit mechanisms */
+/* Lightweight audit: pass all the shadows associated with this guest walk
+ * through the audit mechanisms.  For each level's gmfn, look up the
+ * corresponding shadow (if any) and audit it; a PSE l2e with no l1
+ * shadow is audited via its FL1 shadow instead. */
+static void sh_audit_gw(struct vcpu *v, walk_t *gw)
+{
+    mfn_t smfn;
+
+    if ( !(SHADOW_AUDIT_ENABLE) )
+        return;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    if ( valid_mfn(gw->l4mfn)
+         && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn,
+                                                PGC_SH_l4_shadow))) )
+        (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
+#endif /* PAE or 64... */
+    if ( valid_mfn(gw->l3mfn)
+         && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn,
+                                                PGC_SH_l3_shadow))) )
+        (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
+#endif /* All levels... */
+    if ( valid_mfn(gw->l2mfn) )
+    {
+        if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
+                                                 PGC_SH_l2_shadow))) )
+            (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
+#if GUEST_PAGING_LEVELS == 3
+        /* PAE guests may also have a high-slot l2 shadow for this l2. */
+        if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
+                                                 PGC_SH_l2h_shadow))) )
+            (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
+#endif
+    }
+    if ( valid_mfn(gw->l1mfn)
+         && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn,
+                                                PGC_SH_l1_shadow))) )
+        (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
+    else if ( gw->l2e
+              && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
+              && valid_mfn(
+              (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
+        (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
+}
+
+#else
+#define sh_audit_gw(_v, _gw) do {} while(0)
+#endif /* audit code */
+
+
+
+/**************************************************************************/
+/* Function to write to the guest tables, for propagating accessed and
+ * dirty bits from the shadow to the guest.
+ * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
+ * and an operation type. The guest entry is always passed as an l1e:
+ * since we only ever write flags, that's OK.
+ * Returns the new flag bits of the guest entry. */
+
+/* Set accessed (and, for demand writes, dirty) bits in a guest pagetable
+ * entry, then propagate the changed entry into any other shadows of the
+ * same guest page.  @ep is always typed as an l1e because only flag bits
+ * are written (see the comment block above).  Returns the new guest
+ * flags.  Must be called under the shadow lock. */
+static u32 guest_set_ad_bits(struct vcpu *v,
+                             mfn_t gmfn,
+                             guest_l1e_t *ep,
+                             unsigned int level,
+                             fetch_type_t ft)
+{
+    u32 flags, shflags, bit;
+    struct page_info *pg;
+    int res = 0;
+
+    /* gmfn must be a pagetable, or a not-yet-promoted page with no
+     * outstanding type refs (see the TJD comment further down). */
+    ASSERT(valid_mfn(gmfn)
+           && (sh_mfn_is_a_page_table(gmfn)
+               || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
+                   == 0)));
+    /* ep must be naturally aligned: safe_write semantics depend on it. */
+    ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
+    ASSERT(level <= GUEST_PAGING_LEVELS);
+    ASSERT(ft == ft_demand_read || ft == ft_demand_write);
+    ASSERT(shadow_lock_is_acquired(v->domain));
+
+    flags = guest_l1e_get_flags(*ep);
+
+    /* PAE l3s do not have A and D bits */
+    if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) )
+        return flags;
+
+    /* Need the D bit as well for writes, in l1es and 32bit/PAE PSE l2es. */
+    if ( ft == ft_demand_write
+         && (level == 1 ||
+             (level == 2 && GUEST_PAGING_LEVELS < 4
+              && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
+    {
+        if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
+             == (_PAGE_DIRTY | _PAGE_ACCESSED) )
+            return flags;  /* Guest already has A and D bits set */
+        flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
+        perfc_incrc(shadow_ad_update);
+    }
+    else
+    {
+        if ( flags & _PAGE_ACCESSED )
+            return flags;  /* Guest already has A bit set */
+        flags |= _PAGE_ACCESSED;
+        perfc_incrc(shadow_a_update);
+    }
+
+    /* Set the bit(s) */
+    /* Writing the guest PT itself dirties its frame in log-dirty mode. */
+    sh_mark_dirty(v->domain, gmfn);
+    SHADOW_DEBUG(A_AND_D, "gfn = %"SH_PRI_gfn", "
+                 "old flags = %#x, new flags = %#x\n",
+                 guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags);
+    *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
+
+    /* May need to propagate this change forward to other kinds of shadow */
+    pg = mfn_to_page(gmfn);
+    if ( !sh_mfn_is_a_page_table(gmfn) )
+    {
+        /* This guest pagetable is not yet shadowed at all. */
+        // MAF: I think this assert is busted...  If this gmfn has not yet
+        // been promoted, then it seems perfectly reasonable for there to be
+        // outstanding type refs to it...
+        /* TJD: No. If the gmfn has not been promoted, we must at least
+         * have recognised that it is a pagetable, and pulled write access.
+         * The type count should only be non-zero if it is actually a page
+         * table.  The test above was incorrect, though, so I've fixed it. */
+        ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0);
+        return flags;
+    }
+
+    /* Revalidate the written entry against every shadow type this page
+     * has, except the one we are currently fetching through (the caller
+     * handles that one). */
+    shflags = pg->shadow_flags & SHF_page_type_mask;
+    while ( shflags )
+    {
+        bit = find_first_set_bit(shflags);
+        ASSERT(shflags & (1u << bit));
+        shflags &= ~(1u << bit);
+        if ( !(pg->shadow_flags & (1u << bit)) )
+            continue;
+        switch ( bit )
+        {
+        case PGC_SH_type_to_index(PGC_SH_l1_shadow):
+            if (level != 1)
+                res |= sh_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep));
+            break;
+        case PGC_SH_type_to_index(PGC_SH_l2_shadow):
+            if (level != 2)
+                res |= sh_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep));
+            break;
+#if GUEST_PAGING_LEVELS == 3 /* PAE only */
+        case PGC_SH_type_to_index(PGC_SH_l2h_shadow):
+            if (level != 2)
+                res |= sh_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep));
+            break;
+#endif
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+        case PGC_SH_type_to_index(PGC_SH_l3_shadow):
+            if (level != 3)
+                res |= sh_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep));
+            break;
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+        case PGC_SH_type_to_index(PGC_SH_l4_shadow):
+            if (level != 4)
+                res |= sh_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep));
+            break;
+#endif
+#endif
+        default:
+            SHADOW_ERROR("mfn %"SH_PRI_mfn" is shadowed in multiple "
+                         "modes: A&D bits may be out of sync (flags=%#x).\n",
+                         mfn_x(gmfn), pg->shadow_flags);
+            /* XXX Shadows in other modes will not be updated, so will
+             * have their A and D bits out of sync. */
+        }
+    }
+
+    /* We should never need to flush the TLB or recopy PAE entries */
+    ASSERT( res == 0 || res == SHADOW_SET_CHANGED );
+    return flags;
+}
+
+/**************************************************************************/
+/* Functions to compute the correct index into a shadow page, given an
+ * index into the guest page (as returned by guest_get_index()).
+ * This is trivial when the shadow and guest use the same sized PTEs, but
+ * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
+ * PAE- or 64-bit shadows).
+ *
+ * These functions also increment the shadow mfn, when necessary. When PTE
+ * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
+ * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
+ * use simple pointer arithmetic on a pointer to the guest L1e to figure out
+ * which shadow page we really want. Similarly, when PTE sizes are
+ * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
+ * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
+ * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
+ * space.)
+ *
+ * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
+ * of shadow (to store both the shadow, and the info that would normally be
+ * stored in page_info fields). This arrangement allows the shadow and the
+ * "page_info" fields to always be stored in the same page (in fact, in
+ * the same cache line), avoiding an extra call to map_domain_page().
+ */
+
+static inline u32
+guest_index(void *ptr)
+/* Index of a guest PTE within its page, from a pointer into a mapping. */
+{
+    return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
+}
+
+static inline u32
+shadow_l1_index(mfn_t *smfn, u32 guest_index)
+/* Map a guest l1 index to a shadow l1 index, adjusting *smfn to the
+ * right page of a multi-page shadow when PTE sizes are mismatched
+ * (see the comment block above). */
+{
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+    /* 2-on-3+: one guest l1 is shadowed by 2 contiguous pages. */
+    *smfn = _mfn(mfn_x(*smfn) +
+                 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
+    return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
+#else
+    return guest_index;
+#endif
+}
+
+static inline u32
+shadow_l2_index(mfn_t *smfn, u32 guest_index)
+/* Map a guest l2 index to a shadow l2 index, adjusting *smfn for the
+ * 4-page shadow used in the 2-on-3+ case. */
+{
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+    // Because we use 2 shadow l2 entries for each guest entry, the number of
+    // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
+    //
+    *smfn = _mfn(mfn_x(*smfn) +
+                 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
+
+    // We multiply by two to get the index of the first of the two entries
+    // used to shadow the specified guest entry.
+    return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
+#else
+    return guest_index;
+#endif
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+
+static inline u32
+shadow_l3_index(mfn_t *smfn, u32 guest_index)
+/* Map a guest l3 index to a shadow l3 index.  For PAE guests the shadow
+ * interleaves 4-entry shadow groups with pae_l3_bookkeeping structs, so
+ * the index is remapped into that doubled layout (see the comment block
+ * above). */
+{
+#if GUEST_PAGING_LEVELS == 3
+    u32 group_id;
+
+    // Because we use twice the space in L3 shadows as was consumed in guest
+    // L3s, the number of guest entries per shadow page is
+    // SHADOW_L2_PAGETABLE_ENTRIES/2.  (Note this is *not*
+    // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...)
+    //
+    *smfn = _mfn(mfn_x(*smfn) +
+                 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
+
+    // We store PAE L3 shadows in groups of 4, alternating shadows and
+    // pae_l3_bookkeeping structs.  So the effective shadow index is
+    // the group_id * 8 + the offset within the group.
+    //
+    guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2);
+    group_id = guest_index / 4;
+    return (group_id * 8) + (guest_index % 4);
+#else
+    return guest_index;
+#endif
+}
+
+#endif // GUEST_PAGING_LEVELS >= 3
+
+#if GUEST_PAGING_LEVELS >= 4
+
+static inline u32
+shadow_l4_index(mfn_t *smfn, u32 guest_index)
+{
+ return guest_index;
+}
+
+#endif // GUEST_PAGING_LEVELS >= 4
+
+
+/**************************************************************************/
+/* Functions which compute shadow entries from their corresponding guest
+ * entries.
+ *
+ * These are the "heart" of the shadow code.
+ *
+ * There are two sets of these: those that are called on demand faults (read
+ * faults and write faults), and those that are essentially called to
+ * "prefetch" (or propagate) entries from the guest into the shadow. The read
+ * fault and write fault are handled as two separate cases for L1 entries (due
+ * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
+ * into the respective demand_fault functions.
+ */
+
+/* Debug-only assertion used by sh_propagate_flags(): on failure, print
+ * and make the caller return -1 (which propagates as sflags == -1 and
+ * trips the ASSERT(sflags != -1) in the callers).
+ * NOTE(review): the trailing ';' after "while (0)" defeats the usual
+ * do/while(0) macro idiom (CHECK in an if/else would misparse) — current
+ * call sites are all statement-position, so it is harmless here. */
+#define CHECK(_cond)                                    \
+do {                                                    \
+    if (unlikely(!(_cond)))                             \
+    {                                                   \
+        printk("%s %s %d ASSERTION (%s) FAILED\n",      \
+               __func__, __FILE__, __LINE__, #_cond);   \
+        return -1;                                      \
+    }                                                   \
+} while (0);
+
+// The function below tries to capture all of the flag manipulation for the
+// demand and propagate functions into one place.
+//
+// The function below tries to capture all of the flag manipulation for the
+// demand and propagate functions into one place.
+//
+// Computes the shadow PTE flags for a guest entry with flags @gflags at
+// pagetable @level, possibly setting guest A/D bits through
+// @guest_entry_ptr on demand fetches.  Returns the shadow flags, or the
+// special not-present signature for absent guest entries, or 0 when a
+// prefetch hits an unbacked gfn.  (The ASSERT(sflags != -1) in callers
+// pairs with CHECK's -1 return in debug builds.)
+//
+static always_inline u32
+sh_propagate_flags(struct vcpu *v, mfn_t target_mfn,
+                   u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn,
+                   int mmio, int level, fetch_type_t ft)
+{
+    struct domain *d = v->domain;
+    u32 pass_thru_flags;
+    u32 sflags;
+
+    // XXX -- might want to think about PAT support for HVM guests...
+
+#ifndef NDEBUG
+    // MMIO can only occur from L1e's
+    //
+    if ( mmio )
+        CHECK(level == 1);
+
+    // We should always have a pointer to the guest entry if it's a non-PSE
+    // non-MMIO demand access.
+    if ( ft & FETCH_TYPE_DEMAND )
+        CHECK(guest_entry_ptr || level == 1);
+#endif
+
+    // A not-present guest entry has a special signature in the shadow table,
+    // so that we do not have to consult the guest tables multiple times...
+    //
+    if ( unlikely(!(gflags & _PAGE_PRESENT)) )
+        return _PAGE_SHADOW_GUEST_NOT_PRESENT;
+
+    // Must have a valid target_mfn, unless this is mmio, or unless this is a
+    // prefetch.  In the case of a prefetch, an invalid mfn means that we can
+    // not usefully shadow anything, and so we return early.
+    //
+    if ( !valid_mfn(target_mfn) )
+    {
+        CHECK((ft == ft_prefetch) || mmio);
+        if ( !mmio )
+            return 0;
+    }
+
+    // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's...
+    //
+    if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) )
+        pass_thru_flags = _PAGE_PRESENT;
+    else
+    {
+        pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
+                           _PAGE_RW | _PAGE_PRESENT);
+        if ( guest_supports_nx(v) )
+            pass_thru_flags |= _PAGE_NX_BIT;
+    }
+
+    // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their
+    // L3e's; they are all implied.  So we emulate them here.
+    //
+    if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) )
+        gflags = pass_thru_flags;
+
+    // Propagate bits from the guest to the shadow.
+    // Some of these may be overwritten, below.
+    // Since we know the guest's PRESENT bit is set, we also set the shadow's
+    // SHADOW_PRESENT bit.
+    //
+    sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT;
+
+    // Copy the guest's RW bit into the SHADOW_RW bit.
+    //
+    if ( gflags & _PAGE_RW )
+        sflags |= _PAGE_SHADOW_RW;
+
+    // Set the A&D bits for higher level shadows.
+    // Higher level entries do not, strictly speaking, have dirty bits, but
+    // since we use shadow linear tables, each of these entries may, at some
+    // point in time, also serve as a shadow L1 entry.
+    // By setting both the A&D bits in each of these, we eliminate the burden
+    // on the hardware to update these bits on initial accesses.
+    //
+    if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
+        sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
+
+
+    // Set the A and D bits in the guest entry, if we need to.
+    if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
+        gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
+
+    // If the A or D bit has not yet been set in the guest, then we must
+    // prevent the corresponding kind of access.
+    //
+    if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) &&
+                  !(gflags & _PAGE_ACCESSED)) )
+        sflags &= ~_PAGE_PRESENT;
+
+    /* D bits exist in l1es, and 32bit/PAE PSE l2es, but not 64bit PSE l2es */
+    if ( unlikely( ((level == 1)
+                    || ((level == 2) && (GUEST_PAGING_LEVELS < 4)
+                        && guest_supports_superpages(v) &&
+                        (gflags & _PAGE_PSE)))
+                   && !(gflags & _PAGE_DIRTY)) )
+        sflags &= ~_PAGE_RW;
+
+    // MMIO caching
+    //
+    // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit
+    // to cache the fact that this entry is in MMIO space.
+    //
+    if ( (level == 1) && mmio )
+    {
+        sflags &= ~(_PAGE_PRESENT);
+        sflags |= _PAGE_SHADOW_MMIO;
+    }
+    else
+    {
+        // shadow_mode_log_dirty support
+        //
+        // Only allow the guest write access to a page a) on a demand fault,
+        // or b) if the page is already marked as dirty.
+        //
+        if ( unlikely((level == 1) &&
+                      !(ft & FETCH_TYPE_WRITE) &&
+                      shadow_mode_log_dirty(d) &&
+                      !sh_mfn_is_dirty(d, target_mfn)) )
+        {
+            sflags &= ~_PAGE_RW;
+        }
+
+        // protect guest page tables
+        //
+        if ( unlikely((level == 1) &&
+                      sh_mfn_is_a_page_table(target_mfn)) )
+        {
+            if ( shadow_mode_trap_reads(d) )
+            {
+                // if we are trapping both reads & writes, then mark this page
+                // as not present...
+                //
+                sflags &= ~_PAGE_PRESENT;
+            }
+            else
+            {
+                // otherwise, just prevent any writes...
+                //
+                sflags &= ~_PAGE_RW;
+            }
+        }
+    }
+
+    return sflags;
+}
+
+#undef CHECK
+
+#if GUEST_PAGING_LEVELS >= 4
+/* Compute and write the shadow l4e (*sl4p) for guest entry *gl4e,
+ * pointing it at the l3 shadow sl3mfn.  Flag logic is delegated to
+ * sh_propagate_flags(); the guest-entry cast to l1e is safe because
+ * only flag bits are examined/written. */
+static void
+l4e_propagate_from_guest(struct vcpu *v,
+                         guest_l4e_t *gl4e,
+                         mfn_t gl4mfn,
+                         mfn_t sl3mfn,
+                         shadow_l4e_t *sl4p,
+                         fetch_type_t ft)
+{
+    u32 gflags = guest_l4e_get_flags(*gl4e);
+    u32 sflags = sh_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e,
+                                    gl4mfn, 0, 4, ft);
+
+    *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags);
+
+    SHADOW_DEBUG(PROPAGATE,
+                 "%s gl4e=%" SH_PRI_gpte " sl4e=%" SH_PRI_pte "\n",
+                 fetch_type_names[ft], gl4e->l4, sl4p->l4);
+    ASSERT(sflags != -1);
+}
+#endif // GUEST_PAGING_LEVELS >= 4
+
+#if GUEST_PAGING_LEVELS >= 3
+/* Compute and write the shadow l3e (*sl3p) for guest entry *gl3e,
+ * pointing it at the l2 shadow sl2mfn.  See l4e_propagate_from_guest
+ * for the pattern. */
+static void
+l3e_propagate_from_guest(struct vcpu *v,
+                         guest_l3e_t *gl3e,
+                         mfn_t gl3mfn,
+                         mfn_t sl2mfn,
+                         shadow_l3e_t *sl3p,
+                         fetch_type_t ft)
+{
+    u32 gflags = guest_l3e_get_flags(*gl3e);
+    u32 sflags = sh_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e,
+                                    gl3mfn, 0, 3, ft);
+
+    *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags);
+
+    SHADOW_DEBUG(PROPAGATE,
+                 "%s gl3e=%" SH_PRI_gpte " sl3e=%" SH_PRI_pte "\n",
+                 fetch_type_names[ft], gl3e->l3, sl3p->l3);
+    ASSERT(sflags != -1);
+}
+#endif // GUEST_PAGING_LEVELS >= 3
+
+/* Compute and write the shadow l2e (*sl2p) for guest entry *gl2e,
+ * pointing it at the l1 shadow sl1mfn.  See l4e_propagate_from_guest
+ * for the pattern. */
+static void
+l2e_propagate_from_guest(struct vcpu *v,
+                         guest_l2e_t *gl2e,
+                         mfn_t gl2mfn,
+                         mfn_t sl1mfn,
+                         shadow_l2e_t *sl2p,
+                         fetch_type_t ft)
+{
+    u32 gflags = guest_l2e_get_flags(*gl2e);
+    u32 sflags = sh_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e,
+                                    gl2mfn, 0, 2, ft);
+
+    *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags);
+
+    SHADOW_DEBUG(PROPAGATE,
+                 "%s gl2e=%" SH_PRI_gpte " sl2e=%" SH_PRI_pte "\n",
+                 fetch_type_names[ft], gl2e->l2, sl2p->l2);
+    ASSERT(sflags != -1);
+}
+
+static inline int
+l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
+               int mmio)
+/* Build the shadow l1e for a demand read through the walk gw.
+ * Returns 1 if emulation is required (read-trapped pagetable page, in
+ * which case *sl1p is left empty), and 0 otherwise. */
+{
+    struct domain *d = v->domain;
+    u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
+    u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
+                                    mmio, 1, ft_demand_read);
+
+    if ( shadow_mode_trap_reads(d) && !mmio && sh_mfn_is_a_page_table(gmfn) )
+    {
+        // emulation required!
+        *sl1p = shadow_l1e_empty();
+        return 1;
+    }
+
+    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+    SHADOW_DEBUG(PROPAGATE,
+                 "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
+                 (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
+
+    ASSERT(sflags != -1);
+    return 0;
+}
+
+static inline int
+l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
+                int mmio)
+/* Build the shadow l1e for a demand write through the walk gw, marking
+ * the target frame dirty for log-dirty mode.
+ * Returns 1 if emulation is required (write to a pagetable page, in
+ * which case *sl1p is left empty), and 0 otherwise. */
+{
+    struct domain *d = v->domain;
+    u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
+    u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
+                                    mmio, 1, ft_demand_write);
+
+    sh_mark_dirty(d, gmfn);
+
+    if ( !mmio && sh_mfn_is_a_page_table(gmfn) )
+    {
+        // emulation required!
+        *sl1p = shadow_l1e_empty();
+        return 1;
+    }
+
+    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+    SHADOW_DEBUG(PROPAGATE,
+                 "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
+                 (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
+
+    ASSERT(sflags != -1);
+    return 0;
+}
+
+static inline void
+l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p,
+                         int mmio)
+/* Prefetch path: build the shadow l1e for guest entry gl1e with no
+ * demand fault (so no guest A/D update — note the NULL entry pointer
+ * and INVALID_MFN gmfn passed to sh_propagate_flags).  For mmio, the
+ * gfn is used directly as the "mfn". */
+{
+    gfn_t gfn = guest_l1e_get_gfn(gl1e);
+    mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn);
+    u32 gflags = guest_l1e_get_flags(gl1e);
+    u32 sflags = sh_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN),
+                                    mmio, 1, ft_prefetch);
+
+    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+    SHADOW_DEBUG(PROPAGATE,
+                 "gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
+                 gl1e.l1, sl1p->l1);
+
+    ASSERT(sflags != -1);
+}
+
+
+/**************************************************************************/
+/* These functions update shadow entries (and do bookkeeping on the shadow
+ * tables they are in). It is intended that they are the only
+ * functions which ever write (non-zero) data onto a shadow page.
+ *
+ * They return a set of flags:
+ * SHADOW_SET_CHANGED -- we actually wrote a new value to the shadow.
+ * SHADOW_SET_FLUSH -- the caller must cause a TLB flush.
+ * SHADOW_SET_ERROR -- the input is not a valid entry (for example, if
+ * shadow_get_page_from_l1e() fails).
+ * SHADOW_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local
+ * copies of their PAE L3 entries re-copied.
+ */
+
+static inline void safe_write_entry(void *dst, void *src)
+/* Copy one PTE safely when processors might be running on the
+ * destination pagetable.  This does *not* give safety against
+ * concurrent writes (that's what the shadow lock is for), just
+ * stops the hardware picking up partially written entries.
+ * Requires dst to be entry-aligned (asserted below). */
+{
+    volatile unsigned long *d = dst;
+    unsigned long *s = src;
+    ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
+#if CONFIG_PAGING_LEVELS == 3
+    /* In PAE mode, pagetable entries are larger
+     * than machine words, so won't get written atomically.  We need to make
+     * sure any other cpu running on these shadows doesn't see a
+     * half-written entry.  Do this by marking the entry not-present first,
+     * then writing the high word before the low word. */
+    BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
+    /* Low word first: clearing it clears PRESENT, making the stale high
+     * word harmless until the final low-word write completes the entry. */
+    d[0] = 0;
+    d[1] = s[1];
+    d[0] = s[0];
+#else
+    /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
+     * which will be an atomic write, since the entry is aligned. */
+    BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
+    *d = *s;
+#endif
+}
+
+
+static inline void
+shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
+/* This function does the actual writes to shadow pages.
+ * It must not be called directly, since it doesn't do the bookkeeping
+ * that shadow_set_l*e() functions do.
+ * @d points into the linear map of shadow page @mfn; @s is the source
+ * array of @entries entries. */
+{
+    shadow_l1e_t *dst = d;
+    shadow_l1e_t *src = s;
+    void *map = NULL;
+    int i;
+
+    /* Because we mirror access rights at all levels in the shadow, an
+     * l2 (or higher) entry with the RW bit cleared will leave us with
+     * no write access through the linear map.
+     * We detect that by writing to the shadow with copy_to_user() and
+     * using map_domain_page() to get a writeable mapping if we need to. */
+    /* Probe: rewrite the first word with its own value; fails iff the
+     * linear mapping is not writeable. */
+    if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
+    {
+        perfc_incrc(shadow_linear_map_failed);
+        map = sh_map_domain_page(mfn);
+        ASSERT(map != NULL);
+        dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
+    }
+
+
+    for ( i = 0; i < entries; i++ )
+        safe_write_entry(dst++, src++);
+
+    if ( map != NULL ) sh_unmap_domain_page(map);
+
+    /* XXX TODO:
+     * Update min/max field in page_info struct of this mfn */
+}
+
+static inline int
+perms_strictly_increased(u32 old_flags, u32 new_flags)
+/* Given the flags of two entries, are the new flags a strict
+ * increase in rights over the old ones?  Used to decide whether an
+ * entry update can skip a TLB flush (stale lesser rights are safe). */
+{
+    u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
+    u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
+    /* Flip the NX bit, since it's the only one that decreases rights;
+     * we calculate as if it were an "X" bit. */
+    of ^= _PAGE_NX_BIT;
+    nf ^= _PAGE_NX_BIT;
+    /* If the changed bits are all set in the new flags, then rights strictly
+     * increased between old and new. */
+    return ((of | (of ^ nf)) == nf);
+}
+
+static int inline
+shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
+/* Take the reference counts implied by installing sl1e in a shadow l1,
+ * stripping the shadow-private flag bits first.  Returns non-zero on
+ * success, 0 on failure (caller must not install the entry).  Domains
+ * without refcounted shadows trivially succeed. */
+{
+    int res;
+    mfn_t mfn;
+    struct domain *owner;
+    shadow_l1e_t sanitized_sl1e =
+        shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT);
+
+    //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT);
+    //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0);
+
+    if ( !shadow_mode_refcounts(d) )
+        return 1;
+
+    res = get_page_from_l1e(sanitized_sl1e, d);
+
+    // If a privileged domain is attempting to install a map of a page it does
+    // not own, we let it succeed anyway.
+    //
+    if ( unlikely(!res) &&
+         IS_PRIV(d) &&
+         !shadow_mode_translate(d) &&
+         valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) &&
+         (owner = page_get_owner(mfn_to_page(mfn))) &&
+         (d != owner) )
+    {
+        /* Retry the refcount against the page's real owner. */
+        res = get_page_from_l1e(sanitized_sl1e, owner);
+        SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
+                       "which is owned by domain %d: %s\n",
+                       d->domain_id, mfn_x(mfn), owner->domain_id,
+                       res ? "success" : "failed");
+    }
+
+    if ( unlikely(!res) )
+    {
+        perfc_incrc(shadow_get_page_fail);
+        /* Fix: the format string has a conversion but the original code
+         * passed no argument for it (undefined behavior) — supply the
+         * entry's raw value. */
+        SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n", sl1e.l1);
+    }
+
+    return res;
+}
+
+static void inline
+shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
+/* Drop the reference counts taken by shadow_get_page_from_l1e().
+ * No-op for domains without refcounted shadows, matching the get side. */
+{
+    if ( !shadow_mode_refcounts(d) )
+        return;
+
+    put_page_from_l1e(sl1e, d);
+}
+
+#if GUEST_PAGING_LEVELS >= 4
+/* Install new_sl4e at *sl4e (which lives in shadow page sl4mfn), doing
+ * the shadow refcount bookkeeping: take a ref on the new target before
+ * writing, drop the old target's ref after.  Returns SHADOW_SET_* flags
+ * (see the comment block above); SHADOW_SET_FLUSH is reported unless
+ * the change strictly increased permissions on the same mfn. */
+static int shadow_set_l4e(struct vcpu *v,
+                          shadow_l4e_t *sl4e,
+                          shadow_l4e_t new_sl4e,
+                          mfn_t sl4mfn)
+{
+    int flags = 0;
+    shadow_l4e_t old_sl4e;
+    paddr_t paddr;
+    ASSERT(sl4e != NULL);
+    old_sl4e = *sl4e;
+
+    if ( old_sl4e.l4 == new_sl4e.l4 ) return 0;  /* Nothing to do */
+
+    /* Physical address of this entry, used as the shadow's up-pointer. */
+    paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
+             | (((unsigned long)sl4e) & ~PAGE_MASK));
+
+    if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
+    {
+        /* About to install a new reference */
+        sh_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr);
+    }
+
+    /* Write the new entry */
+    shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+    flags |= SHADOW_SET_CHANGED;
+
+    if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
+    {
+        /* We lost a reference to an old mfn. */
+        mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
+        if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
+             || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
+                                          shadow_l4e_get_flags(new_sl4e)) )
+        {
+            flags |= SHADOW_SET_FLUSH;
+        }
+        sh_put_ref(v, osl3mfn, paddr);
+    }
+    return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if GUEST_PAGING_LEVELS >= 3
+/* Install new_sl3e at *sl3e (in shadow page sl3mfn), with the same
+ * refcounting pattern as shadow_set_l4e.  For PAE guests, additionally
+ * flags any vcpus whose low-memory / linear copies of this l3 must be
+ * re-copied (SHADOW_SET_L3PAE_RECOPY). */
+static int shadow_set_l3e(struct vcpu *v,
+                          shadow_l3e_t *sl3e,
+                          shadow_l3e_t new_sl3e,
+                          mfn_t sl3mfn)
+{
+    int flags = 0;
+    shadow_l3e_t old_sl3e;
+    paddr_t paddr;
+    ASSERT(sl3e != NULL);
+    old_sl3e = *sl3e;
+
+    if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
+
+    /* Physical address of this entry, used as the shadow's up-pointer. */
+    paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+             | (((unsigned long)sl3e) & ~PAGE_MASK));
+
+    if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+    {
+        /* About to install a new reference */
+        sh_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr);
+    }
+
+    /* Write the new entry */
+    shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
+    flags |= SHADOW_SET_CHANGED;
+
+#if GUEST_PAGING_LEVELS == 3
+    /* We wrote a guest l3e in a PAE pagetable.  This table is copied in
+     * the linear pagetable entries of its l2s, and may also be copied
+     * to a low memory location to make it fit in CR3.  Report that we
+     * need to resync those copies (we can't wait for the guest to flush
+     * the TLB because it might be an increase in rights). */
+    {
+        struct vcpu *vcpu;
+
+        struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e);
+        for_each_vcpu(v->domain, vcpu)
+        {
+            if (info->vcpus & (1 << vcpu->vcpu_id))
+            {
+                // Remember that this flip/update needs to occur.
+                vcpu->arch.shadow.pae_flip_pending = 1;
+                flags |= SHADOW_SET_L3PAE_RECOPY;
+            }
+        }
+    }
+#endif
+
+    if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
+    {
+        /* We lost a reference to an old mfn. */
+        mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
+        if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
+             !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
+                                       shadow_l3e_get_flags(new_sl3e)) )
+        {
+            flags |= SHADOW_SET_FLUSH;
+        }
+        sh_put_ref(v, osl2mfn, paddr);
+    }
+    return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+/* Install new_sl2e into the shadow l2 slot *sl2e (in shadow table sl2mfn),
+ * maintaining refcounts on the l1 shadow it points at.  In 2-on-3/2-on-4
+ * modes l1 shadows are two pages and are installed as a pair of entries.
+ * Returns a bitmap of SHADOW_SET_* flags. */
+static int shadow_set_l2e(struct vcpu *v,
+                          shadow_l2e_t *sl2e,
+                          shadow_l2e_t new_sl2e,
+                          mfn_t sl2mfn)
+{
+    int flags = 0;
+    shadow_l2e_t old_sl2e;
+    paddr_t paddr;
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+    /* In 2-on-3 we work with pairs of l2es pointing at two-page
+     * shadows. Reference counting and up-pointers track from the first
+     * page of the shadow to the first l2e, so make sure that we're
+     * working with those:
+     * Align the pointer down so it's pointing at the first of the pair */
+    sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
+    /* Align the mfn of the shadow entry too */
+    new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
+#endif
+
+    ASSERT(sl2e != NULL);
+    old_sl2e = *sl2e;
+
+    if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
+
+    /* Physical address of this entry: refcounting backlink. */
+    paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+             | (((unsigned long)sl2e) & ~PAGE_MASK));
+
+    if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
+    {
+        /* About to install a new reference */
+        sh_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
+    }
+
+    /* Write the new entry */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+    {
+        shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
+        /* The l1 shadow is two pages long and need to be pointed to by
+         * two adjacent l1es. The pair have the same flags, but point
+         * at odd and even MFNs */
+        ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
+        pair[1].l2 |= (1<<PAGE_SHIFT);
+        shadow_write_entries(sl2e, &pair, 2, sl2mfn);
+    }
+#else /* normal case */
+    shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
+#endif
+    flags |= SHADOW_SET_CHANGED;
+
+    if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
+    {
+        /* We lost a reference to an old mfn. */
+        mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
+        if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
+             !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
+                                       shadow_l2e_get_flags(new_sl2e)) )
+        {
+            flags |= SHADOW_SET_FLUSH;
+        }
+        sh_put_ref(v, osl1mfn, paddr);
+    }
+    return flags;
+}
+
+/* Install new_sl1e into the shadow l1 slot *sl1e (in shadow table sl1mfn).
+ * For refcounting domains this takes a typed reference on the target
+ * frame; if that fails the entry is installed as empty and
+ * SHADOW_SET_ERROR is reported.  Returns a bitmap of SHADOW_SET_* flags. */
+static int shadow_set_l1e(struct vcpu *v,
+                          shadow_l1e_t *sl1e,
+                          shadow_l1e_t new_sl1e,
+                          mfn_t sl1mfn)
+{
+    int flags = 0;
+    struct domain *d = v->domain;
+    shadow_l1e_t old_sl1e;
+    ASSERT(sl1e != NULL);
+
+    old_sl1e = *sl1e;
+
+    if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
+
+    if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT )
+    {
+        /* About to install a new reference */
+        if ( shadow_mode_refcounts(d) ) {
+            if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
+            {
+                /* Doesn't look like a pagetable. */
+                flags |= SHADOW_SET_ERROR;
+                new_sl1e = shadow_l1e_empty();
+            }
+        }
+    }
+
+    /* Write the new entry */
+    shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
+    flags |= SHADOW_SET_CHANGED;
+
+    if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
+    {
+        /* We lost a reference to an old mfn. */
+        /* N.B. Unlike higher-level sets, never need an extra flush
+         * when writing an l1e. Because it points to the same guest frame
+         * as the guest l1e did, it's the guest's responsibility to
+         * trigger a flush later. */
+        if ( shadow_mode_refcounts(d) )
+        {
+            shadow_put_page_from_l1e(old_sl1e, d);
+        }
+    }
+    return flags;
+}
+
+
+/**************************************************************************/
+/* These functions take a vcpu and a virtual address, and return a pointer
+ * to the appropriate level N entry from the shadow tables.
+ * If the necessary tables are not present in the shadow, they return NULL. */
+
+/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
+ * more levels than the guest, the upper levels are always fixed and do not
+ * reflect any information from the guest, so we do not use these functions
+ * to access them. */
+
+#if GUEST_PAGING_LEVELS >= 4
+/* Return a pointer to the shadow l4e for va, via the shadow linear map.
+ * Never NULL: the top-level shadow table always exists. */
+static shadow_l4e_t *
+shadow_get_l4e(struct vcpu *v, unsigned long va)
+{
+    /* Reading the top level table is always valid. */
+    return sh_linear_l4_table(v) + shadow_l4_linear_offset(va);
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#if GUEST_PAGING_LEVELS >= 3
+/* Return a pointer to the shadow l3e for va, or NULL if no l3 shadow
+ * is reachable (64-bit only; the PAE top level is always mapped). */
+static shadow_l3e_t *
+shadow_get_l3e(struct vcpu *v, unsigned long va)
+{
+#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+    /* Get the l4 */
+    shadow_l4e_t *sl4e = shadow_get_l4e(v, va);
+    ASSERT(sl4e != NULL);
+    if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) )
+        return NULL;
+    ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e)));
+    /* l4 was present; OK to get the l3 */
+    return sh_linear_l3_table(v) + shadow_l3_linear_offset(va);
+#else /* PAE... */
+    /* Top level is always mapped */
+    ASSERT(v->arch.shadow_vtable);
+    return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va);
+#endif
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+
+/* Return a pointer to the shadow l2e for va, or NULL if no l2 shadow
+ * is reachable from the upper shadow levels. */
+static shadow_l2e_t *
+shadow_get_l2e(struct vcpu *v, unsigned long va)
+{
+#if GUEST_PAGING_LEVELS >= 3 /* 64bit/PAE... */
+    /* Get the l3 */
+    shadow_l3e_t *sl3e = shadow_get_l3e(v, va);
+    if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
+        return NULL;
+    ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e)));
+    /* l3 was present; OK to get the l2 */
+#endif
+    return sh_linear_l2_table(v) + shadow_l2_linear_offset(va);
+}
+
+
+#if 0 // avoid the compiler warning for now...
+
+/* Return a pointer to the shadow l1e for va, or NULL if no l1 shadow
+ * is reachable.  Currently unused (compiled out to avoid an
+ * unused-function warning). */
+static shadow_l1e_t *
+shadow_get_l1e(struct vcpu *v, unsigned long va)
+{
+    /* Get the l2 */
+    shadow_l2e_t *sl2e = shadow_get_l2e(v, va);
+    if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) )
+        return NULL;
+    ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e)));
+    /* l2 was present; OK to get the l1 */
+    return sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
+}
+
+#endif
+
+
+/**************************************************************************/
+/* Macros to walk pagetables. These take the shadow of a pagetable and
+ * walk every "interesting" entry. That is, they don't touch Xen mappings,
+ * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
+ * second entry (since pairs of entries are managed together). For multi-page
+ * shadows they walk all pages.
+ *
+ * Arguments are an MFN, the variable to point to each entry, a variable
+ * to indicate that we are done (we will shortcut to the end of the scan
+ * when _done != 0), a variable to indicate that we should avoid Xen mappings,
+ * and the code.
+ *
+ * WARNING: These macros have side-effects. They change the values of both
+ * the pointer and the MFN. */
+
+/* Advance a guest-entry cursor by one entry.  The argument is a pointer
+ * to a guest_l1e_t* cursor (all guest entry sizes are the same, so one
+ * helper serves every level); a NULL argument means "no cursor". */
+static inline void increment_ptr_to_guest_entry(void *ptr)
+{
+    guest_l1e_t **cursor;
+
+    if ( ptr == NULL )
+        return;
+
+    cursor = ptr;
+    (*cursor)++;
+}
+
+/* All kinds of l1: touch all entries.  Maps the shadow page, runs _code
+ * on each present sl1e, advancing the guest-entry cursor in step.
+ * NB: mutates _sl1e and (via _gl1p) the guest cursor -- see WARNING above. */
+#define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)        \
+do {                                                                    \
+    int _i;                                                             \
+    shadow_l1e_t *_sp = map_shadow_page((_sl1mfn));                     \
+    ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask)        \
+           == PGC_SH_l1_shadow                                          \
+           || (mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask)     \
+           == PGC_SH_fl1_shadow);                                       \
+    for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ )              \
+    {                                                                   \
+        (_sl1e) = _sp + _i;                                             \
+        if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT )           \
+            {_code}                                                     \
+        if ( _done ) break;                                             \
+        increment_ptr_to_guest_entry(_gl1p);                            \
+    }                                                                   \
+    unmap_shadow_page(_sp);                                             \
+} while (0)
+
+/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow.
+ * The mfn is advanced to the second page, and _done from the first page
+ * short-circuits the second. */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done,  _code)        \
+do {                                                                    \
+    int __done = 0;                                                     \
+    _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p,                          \
+                         ({ (__done = _done); }), _code);               \
+    _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1);                                 \
+    if ( !__done )                                                      \
+        _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p,                      \
+                             ({ (__done = _done); }), _code);           \
+} while (0)
+#else /* Everything else; l1 shadows are only one page */
+#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)         \
+        _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
+#endif
+
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+
+/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen.
+ * (Entries come in pairs in this mode, so stepping by 2 visits each pair
+ * once; _xen non-zero skips slots at/above HYPERVISOR_VIRT_START.) */
+#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)   \
+do {                                                                    \
+    int _i, _j, __done = 0;                                             \
+    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)        \
+           == PGC_SH_l2_32_shadow);                                     \
+    for ( _j = 0; _j < 4 && !__done; _j++ )                             \
+    {                                                                   \
+        shadow_l2e_t *_sp = map_shadow_page(_sl2mfn);                   \
+        for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 )       \
+            if ( (!(_xen))                                              \
+                 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i)           \
+                 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
+            {                                                           \
+                (_sl2e) = _sp + _i;                                     \
+                if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )   \
+                    {_code}                                             \
+                if ( (__done = (_done)) ) break;                        \
+                increment_ptr_to_guest_entry(_gl2p);                    \
+            }                                                           \
+        unmap_shadow_page(_sp);                                         \
+        _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1);                             \
+    }                                                                   \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 2
+
+/* 32-bit on 32-bit: avoid Xen entries */
+#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)   \
+do {                                                                    \
+    int _i;                                                             \
+    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                     \
+    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)        \
+           == PGC_SH_l2_32_shadow);                                     \
+    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )              \
+        if ( (!(_xen))                                                  \
+             ||                                                         \
+             (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
+        {                                                               \
+            (_sl2e) = _sp + _i;                                         \
+            if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )       \
+                {_code}                                                 \
+            if ( _done ) break;                                         \
+            increment_ptr_to_guest_entry(_gl2p);                        \
+        }                                                               \
+    unmap_shadow_page(_sp);                                             \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 3
+
+/* PAE: if it's an l2h, don't touch Xen mappings.
+ * (Xen lives in the top quarter of the fourth l2; the arithmetic below
+ * offsets _i by three tables' worth of slots to test against it.) */
+#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)   \
+do {                                                                    \
+    int _i;                                                             \
+    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                     \
+    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)        \
+           == PGC_SH_l2_pae_shadow                                      \
+           || (mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)     \
+           == PGC_SH_l2h_pae_shadow);                                   \
+    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )              \
+        if ( (!(_xen))                                                  \
+             || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)  \
+                 != PGC_SH_l2h_pae_shadow)                              \
+             || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES))               \
+                 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
+        {                                                               \
+            (_sl2e) = _sp + _i;                                         \
+            if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )       \
+                {_code}                                                 \
+            if ( _done ) break;                                         \
+            increment_ptr_to_guest_entry(_gl2p);                        \
+        }                                                               \
+    unmap_shadow_page(_sp);                                             \
+} while (0)
+
+#else
+
+/* 64-bit l2: touch all entries */
+#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)   \
+do {                                                                    \
+    int _i;                                                             \
+    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                     \
+    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)        \
+           == PGC_SH_l2_64_shadow);                                     \
+    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )              \
+    {                                                                   \
+        (_sl2e) = _sp + _i;                                             \
+        if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )           \
+            {_code}                                                     \
+        if ( _done ) break;                                             \
+        increment_ptr_to_guest_entry(_gl2p);                            \
+    }                                                                   \
+    unmap_shadow_page(_sp);                                             \
+} while (0)
+
+#endif /* different kinds of l2 */
+
+#if GUEST_PAGING_LEVELS == 3
+
+/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es).
+ * A subshadow is a 4-entry group; _sl3e must point at its first entry. */
+#define SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code)              \
+do {                                                                    \
+    int _i;                                                             \
+    for ( _i = 0; _i < 4; _i++ )                                        \
+    {                                                                   \
+        if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT )           \
+            {_code}                                                     \
+        if ( _done ) break;                                             \
+        _sl3e++;                                                        \
+        increment_ptr_to_guest_entry(_gl3p);                            \
+    }                                                                   \
+} while (0)
+
+/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows.
+ * Subshadows with a zero refcount are skipped, but the guest cursor is
+ * still advanced past their four entries to stay in step. */
+#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code)         \
+do {                                                                    \
+    int _i, _j, _k, __done = 0;                                         \
+    ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask)        \
+           == PGC_SH_l3_pae_shadow);                                    \
+    /* The subshadows are split, 64 on each page of the shadow */       \
+    for ( _j = 0; _j < 2 && !__done; _j++ )                             \
+    {                                                                   \
+        void *_sp = sh_map_domain_page(_sl3mfn);                        \
+        for ( _i = 0; _i < 64; _i++ )                                   \
+        {                                                               \
+            /* Every second 32-byte region is a bookkeeping entry */    \
+            _sl3e = (shadow_l3e_t *)(_sp + (64 * _i));                  \
+            if ( (sl3p_to_info(_sl3e))->refcount > 0 )                  \
+                SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p,                    \
+                                       ({ __done = (_done); __done; }), \
+                                       _code);                          \
+            else                                                        \
+                for ( _k = 0 ; _k < 4 ; _k++ )                          \
+                    increment_ptr_to_guest_entry(_gl3p);                \
+            if ( __done ) break;                                        \
+        }                                                               \
+        sh_unmap_domain_page(_sp);                                      \
+        _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1);                             \
+    }                                                                   \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 4
+
+/* 64-bit l3: touch all entries */
+#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code)         \
+do {                                                                    \
+    int _i;                                                             \
+    shadow_l3e_t *_sp = map_shadow_page((_sl3mfn));                     \
+    ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask)        \
+           == PGC_SH_l3_64_shadow);                                     \
+    for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ )              \
+    {                                                                   \
+        (_sl3e) = _sp + _i;                                             \
+        if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT )           \
+            {_code}                                                     \
+        if ( _done ) break;                                             \
+        increment_ptr_to_guest_entry(_gl3p);                            \
+    }                                                                   \
+    unmap_shadow_page(_sp);                                             \
+} while (0)
+
+/* 64-bit l4: avoid Xen mappings */
+#define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code)   \
+do {                                                                    \
+    int _i;                                                             \
+    shadow_l4e_t *_sp = map_shadow_page((_sl4mfn));                     \
+    ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH_type_mask)        \
+           == PGC_SH_l4_64_shadow);                                     \
+    for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ )              \
+    {                                                                   \
+        if ( (!(_xen)) || is_guest_l4_slot(_i) )                        \
+        {                                                               \
+            (_sl4e) = _sp + _i;                                         \
+            if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT )       \
+                {_code}                                                 \
+            if ( _done ) break;                                         \
+        }                                                               \
+        increment_ptr_to_guest_entry(_gl4p);                            \
+    }                                                                   \
+    unmap_shadow_page(_sp);                                             \
+} while (0)
+
+#endif
+
+
+
+/**************************************************************************/
+/* Functions to install Xen mappings and linear mappings in shadow pages */
+
+static mfn_t sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type);
+
+// XXX -- this function should probably be moved to shadow-common.c, but that
+// probably wants to wait until the shadow types have been moved from
+// shadow-types.h to shadow-private.h
+//
+#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
+/* Populate the Xen-private slots of a new l4 shadow (sl4mfn) for the
+ * l4 guest table gl4mfn: common Xen mappings, per-domain mappings, the
+ * guest and shadow linear maps, and (translated guests) the p2m table. */
+void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
+{
+    struct domain *d = v->domain;
+    shadow_l4e_t *sl4e;
+
+    sl4e = sh_map_domain_page(sl4mfn);
+    ASSERT(sl4e != NULL);
+    ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
+
+    /* Copy the common Xen mappings from the idle domain */
+    memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
+           &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
+           ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
+
+    /* Install the per-domain mappings for this domain */
+    sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
+        shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
+                            __PAGE_HYPERVISOR);
+
+    /* Linear mapping */
+    /* LINEAR_PT maps the guest's tables; SH_LINEAR_PT maps the shadows. */
+    sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
+        shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
+    sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+        shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
+
+    if ( shadow_mode_translate(v->domain) )
+    {
+        /* install domain-specific P2M table */
+        sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
+            shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
+                                __PAGE_HYPERVISOR);
+    }
+
+    sh_unmap_domain_page(sl4e);
+}
+#endif
+
+#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
+// For 3-on-3 PV guests, we need to make sure the xen mappings are in
+// place, which means that we need to populate the l2h entry in the l3
+// table.
+
+/* Populate the Xen-private slots of a "high" l2 shadow (sl2hmfn):
+ * common Xen mappings, per-domain mappings, and (translated guests)
+ * the p2m table.  No linear map yet -- see the comment below. */
+void sh_install_xen_entries_in_l2h(struct vcpu *v,
+                                    mfn_t sl2hmfn)
+{
+    struct domain *d = v->domain;
+    shadow_l2e_t *sl2e;
+    int i;
+
+    sl2e = sh_map_domain_page(sl2hmfn);
+    ASSERT(sl2e != NULL);
+    ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
+
+    /* Copy the common Xen mappings from the idle domain */
+    memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
+           &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
+           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+
+    /* Install the per-domain mappings for this domain */
+    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+        sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+            shadow_l2e_from_mfn(
+                page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
+                __PAGE_HYPERVISOR);
+
+    /* We don't set up a linear mapping here because we can't until this
+     * l2h is installed in an l3e. sh_update_linear_entries() handles
+     * the linear mappings when the l3 is loaded. */
+
+    if ( shadow_mode_translate(d) )
+    {
+        /* Install the domain-specific p2m table */
+        l3_pgentry_t *p2m;
+        ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+        p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+        for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
+        {
+            sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
+                shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
+                                    __PAGE_HYPERVISOR);
+        }
+        sh_unmap_domain_page(p2m);
+    }
+
+    sh_unmap_domain_page(sl2e);
+}
+
+/* Ensure slot 3 of a new PAE l3 shadow points at an l2h shadow of the
+ * guest's top l2, so the Xen mappings are reachable.  Makes the l2h
+ * shadow on demand if it doesn't already exist. */
+void sh_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn)
+{
+    shadow_l3e_t *sl3e;
+    guest_l3e_t *gl3e = v->arch.guest_vtable;
+    shadow_l3e_t new_sl3e;
+    gfn_t l2gfn;
+    mfn_t l2gmfn, l2smfn;
+    int r;
+
+    ASSERT(!shadow_mode_external(v->domain));
+    ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT);
+    l2gfn = guest_l3e_get_gfn(gl3e[3]);
+    l2gmfn = sh_gfn_to_mfn(v->domain, gfn_x(l2gfn));
+    l2smfn = get_shadow_status(v, l2gmfn, PGC_SH_l2h_shadow);
+    if ( !valid_mfn(l2smfn) )
+    {
+        l2smfn = sh_make_shadow(v, l2gmfn, PGC_SH_l2h_shadow);
+    }
+    l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e,
+                             ft_prefetch);
+    sl3e = sh_map_domain_page(sl3mfn);
+    /* NOTE(review): the SHADOW_SET_* result of shadow_set_l3e (flush /
+     * PAE-recopy flags) is discarded here -- presumably safe because the
+     * table is still being constructed; confirm. */
+    r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn);
+    sh_unmap_domain_page(sl3e);
+}
+#endif
+
+
+#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
+/* Populate the Xen-private slots of a new 2-level l2 shadow (sl2mfn)
+ * for guest table gl2mfn: common Xen mappings, per-domain mappings,
+ * linear maps, and (translated guests) the p2m table. */
+void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
+{
+    struct domain *d = v->domain;
+    shadow_l2e_t *sl2e;
+    int i;
+
+    sl2e = sh_map_domain_page(sl2mfn);
+    ASSERT(sl2e != NULL);
+    ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
+
+    /* Copy the common Xen mappings from the idle domain */
+    memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
+           &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
+           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+
+    /* Install the per-domain mappings for this domain */
+    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+        sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+            shadow_l2e_from_mfn(
+                page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
+                __PAGE_HYPERVISOR);
+
+    /* Linear mapping */
+    /* LINEAR_PT maps the guest's tables; SH_LINEAR_PT maps the shadows. */
+    sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
+        shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
+    sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+        shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
+
+    if ( shadow_mode_translate(d) )
+    {
+        /* install domain-specific P2M table */
+        sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
+            shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
+                                __PAGE_HYPERVISOR);
+    }
+
+    sh_unmap_domain_page(sl2e);
+}
+#endif
+
+
+
+
+
+/**************************************************************************/
+/* Create a shadow of a given guest page.
+ */
+/* Allocate and initialise a shadow of the given type for guest page
+ * gmfn: installs Xen mappings for non-external root shadows, promotes
+ * the guest page, and records the shadow in the hash.  Returns the
+ * mfn of the new shadow. */
+static mfn_t
+sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+{
+    mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
+    SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
+                  mfn_x(gmfn), shadow_type, mfn_x(smfn));
+
+    if ( shadow_type != PGC_SH_guest_root_type )
+        /* Lower-level shadow, not yet linked from a higher level */
+        mfn_to_page(smfn)->up = 0;
+
+    // Create the Xen mappings...
+    if ( !shadow_mode_external(v->domain) )
+    {
+        switch (shadow_type)
+        {
+#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
+        case PGC_SH_l4_shadow:
+            sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
+#endif
+#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
+        case PGC_SH_l3_shadow:
+            sh_install_xen_entries_in_l3(v, gmfn, smfn); break;
+        case PGC_SH_l2h_shadow:
+            sh_install_xen_entries_in_l2h(v, smfn); break;
+#endif
+#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
+        case PGC_SH_l2_shadow:
+            sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
+#endif
+        default: /* Do nothing */ break;
+        }
+    }
+
+    shadow_promote(v, gmfn, shadow_type);
+    set_shadow_status(v, gmfn, shadow_type, smfn);
+
+    return smfn;
+}
+
+/* Make a splintered superpage shadow: an fl1 shadow keyed by the
+ * superpage's gfn rather than by a backing guest l1 mfn (there is none).
+ * Records it in the fl1 hash and returns its mfn. */
+static mfn_t
+make_fl1_shadow(struct vcpu *v, gfn_t gfn)
+{
+    mfn_t smfn = shadow_alloc(v->domain, PGC_SH_fl1_shadow,
+                               (unsigned long) gfn_x(gfn));
+
+    SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" SH_PRI_mfn "\n",
+                  gfn_x(gfn), mfn_x(smfn));
+
+    set_fl1_shadow_status(v, gfn, smfn);
+    return smfn;
+}
+
+
+#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
+/* Build the monitor pagetable for this vcpu: the table Xen itself runs
+ * on while the vcpu's shadows are active.  The shape depends on
+ * CONFIG_PAGING_LEVELS; returns the mfn of the top-level monitor table. */
+mfn_t
+sh_make_monitor_table(struct vcpu *v)
+{
+
+    ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
+
+#if CONFIG_PAGING_LEVELS == 4
+    {
+        struct domain *d = v->domain;
+        mfn_t m4mfn;
+        m4mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
+        sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
+        /* Remember the level of this table */
+        mfn_to_page(m4mfn)->shadow_flags = 4;
+#if SHADOW_PAGING_LEVELS < 4
+        // Install a monitor l3 table in slot 0 of the l4 table.
+        // This is used for shadow linear maps.
+        {
+            mfn_t m3mfn;
+            l4_pgentry_t *l4e;
+            m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
+            mfn_to_page(m3mfn)->shadow_flags = 3;
+            l4e = sh_map_domain_page(m4mfn);
+            l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
+            sh_unmap_domain_page(l4e);
+        }
+#endif /* SHADOW_PAGING_LEVELS < 4 */
+        return m4mfn;
+    }
+
+#elif CONFIG_PAGING_LEVELS == 3
+
+    {
+        struct domain *d = v->domain;
+        mfn_t m3mfn, m2mfn;
+        l3_pgentry_t *l3e;
+        l2_pgentry_t *l2e;
+        int i;
+
+        m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
+        /* Remember the level of this table */
+        mfn_to_page(m3mfn)->shadow_flags = 3;
+
+        // Install a monitor l2 table in slot 3 of the l3 table.
+        // This is used for all Xen entries, including linear maps
+        m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
+        mfn_to_page(m2mfn)->shadow_flags = 2;
+        l3e = sh_map_domain_page(m3mfn);
+        l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
+        sh_install_xen_entries_in_l2h(v, m2mfn);
+        /* Install the monitor's own linear map */
+        l2e = sh_map_domain_page(m2mfn);
+        for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+            l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
+                (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
+                ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
+                : l2e_empty();
+        sh_unmap_domain_page(l2e);
+        sh_unmap_domain_page(l3e);
+
+        SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
+        return m3mfn;
+    }
+
+#elif CONFIG_PAGING_LEVELS == 2
+
+    {
+        struct domain *d = v->domain;
+        mfn_t m2mfn;
+        m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
+        sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
+        /* Remember the level of this table */
+        mfn_to_page(m2mfn)->shadow_flags = 2;
+        return m2mfn;
+    }
+
+#else
+#error this should not happen
+#endif /* CONFIG_PAGING_LEVELS */
+}
+#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
+
+/**************************************************************************/
+/* These functions also take a virtual address and return the level-N
+ * shadow table mfn and entry, but they create the shadow pagetables if
+ * they are needed. The "demand" argument is non-zero when handling
+ * a demand fault (so we know what to do about accessed bits &c).
+ * If the necessary tables are not present in the guest, they return NULL. */
+#if GUEST_PAGING_LEVELS >= 4
+/* Return a pointer to the shadow l4e for gw->va, storing the l4 shadow's
+ * mfn in *sl4mfn.  Never fails: the top-level shadow always exists. */
+static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
+                                                walk_t *gw,
+                                                mfn_t *sl4mfn)
+{
+    /* There is always a shadow of the top level table. Get it. */
+    *sl4mfn = pagetable_get_mfn(v->arch.shadow_table);
+    /* Reading the top level table is always valid. */
+    return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#if GUEST_PAGING_LEVELS >= 3
+/* Return a pointer to the shadow l3e for gw->va, creating the l3 shadow
+ * and linking it into the l4 shadow if necessary (64-bit); on PAE the
+ * top level always exists.  Stores the l3 shadow's mfn in *sl3mfn.
+ * Returns NULL only if the guest has no l3 page. */
+static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
+                                                walk_t *gw,
+                                                mfn_t *sl3mfn,
+                                                fetch_type_t ft)
+{
+#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+    mfn_t sl4mfn;
+    shadow_l4e_t *sl4e;
+    if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */
+    /* Get the l4e */
+    sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
+    ASSERT(sl4e != NULL);
+    if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
+    {
+        *sl3mfn = shadow_l4e_get_mfn(*sl4e);
+        ASSERT(valid_mfn(*sl3mfn));
+    }
+    else
+    {
+        int r;
+        shadow_l4e_t new_sl4e;
+        /* No l3 shadow installed: find and install it. */
+        *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH_l3_shadow);
+        if ( !valid_mfn(*sl3mfn) )
+        {
+            /* No l3 shadow of this page exists at all: make one. */
+            *sl3mfn = sh_make_shadow(v, gw->l3mfn, PGC_SH_l3_shadow);
+        }
+        /* Install the new sl3 table in the sl4e */
+        l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
+                                 *sl3mfn, &new_sl4e, ft);
+        r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
+        ASSERT((r & SHADOW_SET_FLUSH) == 0);
+    }
+    /* Now follow it down a level. Guaranteed to succeed. */
+    return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
+#else /* PAE... */
+    /* There is always a shadow of the top level table. Get it. */
+    *sl3mfn = pagetable_get_mfn(v->arch.shadow_table);
+    /* This next line is important: the shadow l3 table is in an 8k
+     * shadow and we need to return the right mfn of the pair. This call
+     * will set it for us as a side-effect. */
+    (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e));
+    ASSERT(v->arch.shadow_vtable);
+    return ((shadow_l3e_t *)v->arch.shadow_vtable)
+        + shadow_l3_table_offset(gw->va);
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+
+/* Return a pointer to the shadow l2e for gw->va, creating the l2 shadow
+ * and linking it into the l3 shadow if necessary.  Stores the l2
+ * shadow's mfn in *sl2mfn.  Returns NULL only if the guest has no
+ * l2 page. */
+static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
+                                                walk_t *gw,
+                                                mfn_t *sl2mfn,
+                                                fetch_type_t ft)
+{
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */
+    mfn_t sl3mfn = _mfn(INVALID_MFN);
+    shadow_l3e_t *sl3e;
+    if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
+    /* Get the l3e */
+    sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
+    ASSERT(sl3e != NULL); /* Since we know guest PT is valid this far */
+    if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
+    {
+        *sl2mfn = shadow_l3e_get_mfn(*sl3e);
+        ASSERT(valid_mfn(*sl2mfn));
+    }
+    else
+    {
+        int r;
+        shadow_l3e_t new_sl3e;
+        /* No l2 shadow installed: find and install it. */
+        *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH_l2_shadow);
+        if ( !valid_mfn(*sl2mfn) )
+        {
+            /* No l2 shadow of this page exists at all: make one. */
+            *sl2mfn = sh_make_shadow(v, gw->l2mfn, PGC_SH_l2_shadow);
+        }
+        /* Install the new sl2 table in the sl3e */
+        l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
+                                 *sl2mfn, &new_sl3e, ft);
+        r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
+        ASSERT((r & SHADOW_SET_FLUSH) == 0);
+#if GUEST_PAGING_LEVELS == 3
+        /* Need to sync up the linear maps, as we are about to use them */
+        ASSERT( r & SHADOW_SET_L3PAE_RECOPY );
+        sh_pae_recopy(v->domain);
+#endif
+    }
+    /* Now follow it down a level. Guaranteed to succeed. */
+    return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
+#else /* 32bit... */
+    /* There is always a shadow of the top level table. Get it. */
+    *sl2mfn = pagetable_get_mfn(v->arch.shadow_table);
+    /* This next line is important: the guest l2 has a 16k
+     * shadow, we need to return the right mfn of the four. This
+     * call will set it for us as a side-effect. */
+    (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
+    /* Reading the top level table is always valid. */
+    return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
+#endif
+}
+
+
+/* Return a pointer to the shadow l1e for gw->va, creating the l1 (or
+ * fl1, for guest superpages) shadow and linking it into the l2 shadow
+ * if necessary.  Stores the l1 shadow's mfn in *sl1mfn.  Returns NULL
+ * if the guest has no usable l2e/l1 page. */
+static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
+                                                walk_t *gw,
+                                                mfn_t *sl1mfn,
+                                                fetch_type_t ft)
+{
+    mfn_t sl2mfn;
+    shadow_l2e_t *sl2e;
+
+    /* Get the l2e */
+    sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
+    if ( sl2e == NULL ) return NULL;
+    if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
+    {
+        *sl1mfn = shadow_l2e_get_mfn(*sl2e);
+        ASSERT(valid_mfn(*sl1mfn));
+    }
+    else
+    {
+        shadow_l2e_t new_sl2e;
+        int r, flags = guest_l2e_get_flags(*gw->l2e);
+        /* No l1 shadow installed: find and install it. */
+        if ( !(flags & _PAGE_PRESENT) )
+            return NULL; /* No guest page. */
+        if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
+        {
+            /* Splintering a superpage */
+            gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
+            *sl1mfn = get_fl1_shadow_status(v, l2gfn);
+            if ( !valid_mfn(*sl1mfn) )
+            {
+                /* No fl1 shadow of this superpage exists at all: make one. */
+                *sl1mfn = make_fl1_shadow(v, l2gfn);
+            }
+        }
+        else
+        {
+            /* Shadowing an actual guest l1 table */
+            /* NOTE(review): this tests gw->l2mfn but the lines below use
+             * gw->l1mfn -- looks like it should test l1mfn; confirm
+             * against the walk_t contract before changing. */
+            if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
+            *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH_l1_shadow);
+            if ( !valid_mfn(*sl1mfn) )
+            {
+                /* No l1 shadow of this page exists at all: make one. */
+                *sl1mfn = sh_make_shadow(v, gw->l1mfn, PGC_SH_l1_shadow);
+            }
+        }
+        /* Install the new sl1 table in the sl2e */
+        l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
+                                 *sl1mfn, &new_sl2e, ft);
+        r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
+        ASSERT((r & SHADOW_SET_FLUSH) == 0);
+        /* This next line is important: in 32-on-PAE and 32-on-64 modes,
+         * the guest l1 table has an 8k shadow, and we need to return
+         * the right mfn of the pair. This call will set it for us as a
+         * side-effect. (In all other cases, it's a no-op and will be
+         * compiled out.) */
+        (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
+    }
+    /* Now follow it down a level. Guaranteed to succeed. */
+    return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
+}
+
+
+
+/**************************************************************************/
+/* Destructors for shadow tables:
+ * Unregister the shadow, decrement refcounts of any entries present in it,
+ * and release the memory.
+ *
+ * N.B. These destructors do not clear the contents of the shadows.
+ * This allows us to delay TLB shootdowns until the page is being reused.
+ * See shadow_alloc() and shadow_free() for how this is handled.
+ */
+
+#if GUEST_PAGING_LEVELS >= 4
+/* Tear down an l4 shadow: unhash it, demote the guest page, drop the
+ * references held by its present entries, and free the page.  The
+ * contents are deliberately not cleared (see the comment above). */
+void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
+{
+    shadow_l4e_t *sl4e;
+    u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
+    mfn_t gmfn, sl4mfn;
+    int xen_mappings;
+
+    SHADOW_DEBUG(DESTROY_SHADOW,
+                  "%s(%05lx)\n", __func__, mfn_x(smfn));
+    ASSERT(t == PGC_SH_l4_shadow);
+
+    /* Record that the guest page isn't shadowed any more (in this type) */
+    gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+    delete_shadow_status(v, gmfn, t, smfn);
+    shadow_demote(v, gmfn, t);
+    /* Take this shadow off the list of root shadows */
+    list_del_init(&mfn_to_page(smfn)->list);
+
+    /* Decrement refcounts of all the old entries */
+    xen_mappings = (!shadow_mode_external(v->domain));
+    sl4mfn = smfn;
+    SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
+        if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
+        {
+            sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
+                       (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
+                       | ((unsigned long)sl4e & ~PAGE_MASK));
+        }
+    });
+
+    /* Put the memory back in the pool */
+    shadow_free(v->domain, smfn);
+}
+#endif
+
+#if GUEST_PAGING_LEVELS >= 3
+/* Tear down an l3 shadow: unregister it, drop the references it holds
+ * on its present l2 shadows, and return the page to the shadow pool. */
+void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l3e_t *sl3e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
+ mfn_t gmfn, sl3mfn;
+
+ SHADOW_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH_l3_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow_status(v, gmfn, t, smfn);
+ shadow_demote(v, gmfn, t);
+#if GUEST_PAGING_LEVELS == 3
+ /* Take this shadow off the list of root shadows */
+ /* (For PAE guests the l3 is the top level, hence a root shadow.) */
+ list_del_init(&mfn_to_page(smfn)->list);
+#endif
+
+ /* Decrement refcounts of all the old entries */
+ sl3mfn = smfn;
+ SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
+ if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
+ sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
+ (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl3e & ~PAGE_MASK));
+ });
+
+ /* Put the memory back in the pool */
+ shadow_free(v->domain, smfn);
+}
+#endif
+
+
+#if GUEST_PAGING_LEVELS == 3
+static void sh_destroy_l3_subshadow(struct vcpu *v,
+ shadow_l3e_t *sl3e)
+/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. */
+{
+ int i;
+ /* Subshadows are 32-byte (4-entry) aligned groups within the page */
+ ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0);
+ for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ )
+ if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT )
+ /* NOTE(review): the back-pointer here is the machine address
+ * of the *subshadow base*, not of entry i — presumably
+ * sh_put_ref only needs subshadow granularity; confirm. */
+ sh_put_ref(v, shadow_l3e_get_mfn(sl3e[i]),
+ maddr_from_mapped_domain_page(sl3e));
+}
+#endif
+
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+void sh_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn)
+/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */
+{
+ int i, j;
+ struct pae_l3_bookkeeping *bk;
+
+ ASSERT((mfn_to_page(smfn)->count_info & PGC_SH_type_mask)
+ == PGC_SH_l3_pae_shadow);
+ /* The subshadows are split, 64 on each page of the shadow */
+ for ( i = 0; i < 2; i++ )
+ {
+ void *p = sh_map_domain_page(_mfn(mfn_x(smfn) + i));
+ for ( j = 0; j < 64; j++ )
+ {
+ /* Every second 32-byte region is a bookkeeping entry */
+ /* (Layout: 32 bytes of l3es followed by 32 bytes of
+ * bookkeeping, i.e. a 64-byte stride per subshadow.) */
+ bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32);
+ if ( bk->pinned )
+ sh_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn);
+ /* Check whether we've just freed the whole shadow */
+ if ( (mfn_to_page(smfn)->count_info & PGC_SH_count_mask) == 0 )
+ {
+ sh_unmap_domain_page(p);
+ return;
+ }
+ }
+ sh_unmap_domain_page(p);
+ }
+}
+#endif
+
+/* Tear down an l2 shadow (including a PAE "high" l2): unregister it,
+ * drop the references it holds on its present l1 shadows, and return
+ * the page to the shadow pool. */
+void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l2e_t *sl2e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
+ mfn_t gmfn, sl2mfn;
+ int xen_mappings;
+
+ SHADOW_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH_l2_shadow
+ || t == PGC_SH_l2h_pae_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow_status(v, gmfn, t, smfn);
+ shadow_demote(v, gmfn, t);
+#if GUEST_PAGING_LEVELS == 2
+ /* Take this shadow off the list of root shadows */
+ /* (For 2-level guests the l2 is the top level, hence a root.) */
+ list_del_init(&mfn_to_page(smfn)->list);
+#endif
+
+ /* Decrement refcounts of all the old entries */
+ /* Xen mappings live in the top-level l2 of 2-level guests, or in
+ * the l2h of PAE guests; skip them unless the domain is external. */
+ sl2mfn = smfn;
+ xen_mappings = (!shadow_mode_external(v->domain) &&
+ ((GUEST_PAGING_LEVELS == 2) ||
+ ((GUEST_PAGING_LEVELS == 3) &&
+ (t == PGC_SH_l2h_pae_shadow))));
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
+ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
+ sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
+ (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl2e & ~PAGE_MASK));
+ });
+
+ /* Put the memory back in the pool */
+ shadow_free(v->domain, smfn);
+}
+
+/* Tear down an l1 shadow. fl1 shadows (splintered superpages) are
+ * keyed by guest frame number in the hash, not by a backing mfn, so
+ * they take a different unregistration path and are never demoted. */
+void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
+{
+ struct domain *d = v->domain;
+ shadow_l1e_t *sl1e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
+
+ SHADOW_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH_l1_shadow || t == PGC_SH_fl1_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ if ( t == PGC_SH_fl1_shadow )
+ {
+ gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_fl1_shadow_status(v, gfn, smfn);
+ }
+ else
+ {
+ mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow_status(v, gmfn, t, smfn);
+ shadow_demote(v, gmfn, t);
+ }
+
+ /* Only refcounting modes take references on the frames that l1
+ * entries point at, so only they need to release them here. */
+ if ( shadow_mode_refcounts(d) )
+ {
+ /* Decrement refcounts of all the old entries */
+ mfn_t sl1mfn = smfn;
+ SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
+ if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT )
+ shadow_put_page_from_l1e(*sl1e, d);
+ });
+ }
+
+ /* Put the memory back in the pool */
+ shadow_free(v->domain, smfn);
+}
+
+#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
+/* Free an HVM vcpu's monitor table, including any extra monitor
+ * page that was hooked below it to extend lower-level shadows. */
+void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
+{
+ struct domain *d = v->domain;
+ ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH_type_mask)
+ == PGC_SH_monitor_table);
+
+#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
+ /* Need to destroy the l3 monitor page in slot 0 too */
+ {
+ l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
+ ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+ shadow_free(d, _mfn(l4e_get_pfn(l4e[0])));
+ sh_unmap_domain_page(l4e);
+ }
+#elif CONFIG_PAGING_LEVELS == 3
+ /* Need to destroy the l2 monitor page in slot 3 too */
+ {
+ l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
+ ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
+ shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
+ sh_unmap_domain_page(l3e);
+ }
+#endif
+
+ /* Put the memory back in the pool */
+ shadow_free(d, mmfn);
+}
+#endif
+
+/**************************************************************************/
+/* Functions to destroy non-Xen mappings in a pagetable hierarchy.
+ * These are called from common code when we are running out of shadow
+ * memory, and unpinning all the top-level shadows hasn't worked.
+ *
+ * This implementation is pretty crude and slow, but we hope that it won't
+ * be called very often. */
+
+#if GUEST_PAGING_LEVELS == 2
+
+/* Drop every guest mapping from a 32b guest's top-level (l2) shadow
+ * by writing empty l2es; Xen's own slots are skipped via the
+ * xen_mappings flag for non-external domains. */
+void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
+{
+ shadow_l2e_t *sl2e;
+ int xen_mappings = !shadow_mode_external(v->domain);
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
+ (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ });
+}
+
+#elif GUEST_PAGING_LEVELS == 3
+
+void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn)
+/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */
+{
+ shadow_l3e_t *sl3e;
+ SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
+ if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) {
+ mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e);
+ if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask)
+ == PGC_SH_l2h_pae_shadow )
+ {
+ /* High l2: need to pick particular l2es to unhook */
+ /* (It also holds Xen mappings, which must survive.) */
+ shadow_l2e_t *sl2e;
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, {
+ (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ });
+ }
+ else
+ {
+ /* Normal l2: can safely unhook the whole l3e */
+ (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
+ }
+ }
+ });
+ /* We've changed PAE L3 entries: must sync up various copies of them */
+ sh_pae_recopy(v->domain);
+}
+
+#elif GUEST_PAGING_LEVELS == 4
+
+/* Drop every guest mapping from a 64b guest's top-level (l4) shadow. */
+void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
+{
+ shadow_l4e_t *sl4e;
+ int xen_mappings = !shadow_mode_external(v->domain);
+ SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
+ (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+ });
+}
+
+#endif
+
+/**************************************************************************/
+/* Internal translation functions.
+ * These functions require a pointer to the shadow entry that will be updated.
+ */
+
+/* These functions take a new guest entry, translate it to shadow and write
+ * the shadow entry.
+ *
+ * They return the same bitmaps as the shadow_set_lXe() functions.
+ */
+
+#if GUEST_PAGING_LEVELS >= 4
+/* The guest has written a new l4e; propagate it into the shadow l4
+ * entry *se in shadow page sl4mfn. Returns a SHADOW_SET_* bitmap;
+ * SHADOW_SET_ERROR is set if the guest l3 gfn has no valid mfn. */
+static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
+{
+ shadow_l4e_t new_sl4e;
+ guest_l4e_t *new_gl4e = new_ge;
+ shadow_l4e_t *sl4p = se;
+ mfn_t sl3mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow_validate_gl4e_calls);
+
+ if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
+ {
+ gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
+ mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
+ if ( valid_mfn(gl3mfn) )
+ sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH_l3_shadow);
+ else
+ result |= SHADOW_SET_ERROR;
+ }
+ l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
+ sl3mfn, &new_sl4e, ft_prefetch);
+ result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
+ return result;
+}
+#endif // GUEST_PAGING_LEVELS >= 4
+
+#if GUEST_PAGING_LEVELS >= 3
+/* The guest has written a new l3e; propagate it into the shadow l3
+ * entry *se in shadow page sl3mfn. Returns a SHADOW_SET_* bitmap. */
+static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
+{
+ shadow_l3e_t new_sl3e;
+ guest_l3e_t *new_gl3e = new_ge;
+ shadow_l3e_t *sl3p = se;
+ mfn_t sl2mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow_validate_gl3e_calls);
+
+ if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
+ {
+ gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
+ mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
+ if ( valid_mfn(gl2mfn) )
+ sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH_l2_shadow);
+ else
+ result |= SHADOW_SET_ERROR;
+ }
+ l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
+ sl2mfn, &new_sl3e, ft_prefetch);
+ result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
+
+#if GUEST_PAGING_LEVELS == 3
+ /* We have changed a PAE l3 entry: need to sync up the possible copies
+ * of it */
+ if ( result & SHADOW_SET_L3PAE_RECOPY )
+ sh_pae_recopy(v->domain);
+#endif
+
+ return result;
+}
+#endif // GUEST_PAGING_LEVELS >= 3
+
+/* The guest has written a new l2e; propagate it into the shadow l2
+ * entry *se in shadow page sl2mfn. Superpage l2es are looked up via
+ * the fl1 (splintered superpage) shadow hash. Returns SHADOW_SET_*. */
+static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
+{
+ shadow_l2e_t new_sl2e;
+ guest_l2e_t *new_gl2e = new_ge;
+ shadow_l2e_t *sl2p = se;
+ mfn_t sl1mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow_validate_gl2e_calls);
+
+ if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
+ {
+ gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
+ if ( guest_supports_superpages(v) &&
+ (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
+ {
+ // superpage -- need to look up the shadow L1 which holds the
+ // splitters...
+ sl1mfn = get_fl1_shadow_status(v, gl1gfn);
+#if 0
+ // XXX - it's possible that we want to do some kind of prefetch
+ // for superpage fl1's here, but this is *not* on the demand path,
+ // so we'll hold off trying that for now...
+ //
+ if ( !valid_mfn(sl1mfn) )
+ sl1mfn = make_fl1_shadow(v, gl1gfn);
+#endif
+ }
+ else
+ {
+ mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
+ if ( valid_mfn(gl1mfn) )
+ sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH_l1_shadow);
+ else
+ result |= SHADOW_SET_ERROR;
+ }
+ }
+ l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
+ sl1mfn, &new_sl2e, ft_prefetch);
+ result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
+
+ return result;
+}
+
+/* The guest has written a new l1e; propagate it into the shadow l1
+ * entry *se in shadow page sl1mfn. An unmapped gfn is treated as an
+ * mmio entry. Returns a SHADOW_SET_* bitmap. */
+static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
+{
+ shadow_l1e_t new_sl1e;
+ guest_l1e_t *new_gl1e = new_ge;
+ shadow_l1e_t *sl1p = se;
+ gfn_t gfn;
+ mfn_t mfn;
+ int result = 0;
+
+ perfc_incrc(shadow_validate_gl1e_calls);
+
+ gfn = guest_l1e_get_gfn(*new_gl1e);
+ mfn = vcpu_gfn_to_mfn(v, gfn);
+
+ l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e,
+ /* mmio? */ !valid_mfn(mfn));
+
+ result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
+ return result;
+}
+
+
+/**************************************************************************/
+/* Functions which translate and install a the shadows of arbitrary guest
+ * entries that we have just seen the guest write. */
+
+
+static inline int
+sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
+ void *new_gp, u32 size, u32 sh_type,
+ u32 (*shadow_index)(mfn_t *smfn, u32 idx),
+ int (*validate_ge)(struct vcpu *v, void *ge,
+ mfn_t smfn, void *se))
+/* Generic function for mapping and validating. */
+/* For each guest entry in [new_gp, new_gp+size), map the matching
+ * shadow page and call validate_ge on the entry pair. shadow_index
+ * may redirect to the second page of an 8k shadow (and adjusts *smfn
+ * accordingly). Returns the OR of all SHADOW_SET_* results. */
+{
+ mfn_t smfn, smfn2, map_mfn;
+ shadow_l1e_t *sl1p;
+ u32 shadow_idx, guest_idx;
+ int result = 0;
+
+ /* Align address and size to guest entry boundaries */
+ size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
+ new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
+ size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
+ ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
+
+ /* Map the shadow page */
+ smfn = get_shadow_status(v, gmfn, sh_type);
+ ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */
+ guest_idx = guest_index(new_gp);
+ map_mfn = smfn;
+ shadow_idx = shadow_index(&map_mfn, guest_idx);
+ sl1p = map_shadow_page(map_mfn);
+
+ /* Validate one entry at a time */
+ while ( size )
+ {
+ smfn2 = smfn;
+ guest_idx = guest_index(new_gp);
+ shadow_idx = shadow_index(&smfn2, guest_idx);
+ if ( mfn_x(smfn2) != mfn_x(map_mfn) )
+ {
+ /* We have moved to another page of the shadow */
+ map_mfn = smfn2;
+ unmap_shadow_page(sl1p);
+ sl1p = map_shadow_page(map_mfn);
+ }
+ result |= validate_ge(v,
+ new_gp,
+ map_mfn,
+ &sl1p[shadow_idx]);
+ size -= sizeof(guest_l1e_t);
+ new_gp += sizeof(guest_l1e_t);
+ }
+ unmap_shadow_page(sl1p);
+ return result;
+}
+
+
+/* Validate a batch of guest l4e writes against the l4 shadow.
+ * BUG()s if this translation unit was built without 4-level guests. */
+int
+sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
+ void *new_gl4p, u32 size)
+{
+#if GUEST_PAGING_LEVELS >= 4
+ return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
+ PGC_SH_l4_shadow,
+ shadow_l4_index,
+ validate_gl4e);
+#else // ! GUEST_PAGING_LEVELS >= 4
+ SHADOW_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+/* Validate a batch of guest l3e writes against the l3 shadow.
+ * BUG()s if this translation unit was built without 3-level guests. */
+int
+sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
+ void *new_gl3p, u32 size)
+{
+#if GUEST_PAGING_LEVELS >= 3
+ return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
+ PGC_SH_l3_shadow,
+ shadow_l3_index,
+ validate_gl3e);
+#else // ! GUEST_PAGING_LEVELS >= 3
+ SHADOW_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+/* Validate a batch of guest l2e writes against the l2 shadow. */
+int
+sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
+ void *new_gl2p, u32 size)
+{
+ return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
+ PGC_SH_l2_shadow,
+ shadow_l2_index,
+ validate_gl2e);
+}
+
+/* Validate a batch of guest l2e writes against a PAE "high" l2 shadow
+ * (the l2 covering the top of the address space, which also carries
+ * Xen mappings). Only meaningful for 3-level guests. */
+int
+sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
+ void *new_gl2p, u32 size)
+{
+#if GUEST_PAGING_LEVELS == 3
+ return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
+ PGC_SH_l2h_shadow,
+ shadow_l2_index,
+ validate_gl2e);
+#else /* Non-PAE guests don't have different kinds of l2 table */
+ SHADOW_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+/* Validate a batch of guest l1e writes against the l1 shadow. */
+int
+sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
+ void *new_gl1p, u32 size)
+{
+ return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
+ PGC_SH_l1_shadow,
+ shadow_l1_index,
+ validate_gl1e);
+}
+
+
+/**************************************************************************/
+/* Optimization: If we see two emulated writes of zeros to the same
+ * page-table without another kind of page fault in between, we guess
+ * that this is a batch of changes (for process destruction) and
+ * unshadow the page so we don't take a pagefault on every entry. This
+ * should also make finding writeable mappings of pagetables much
+ * easier. */
+
+/* Look to see if this is the second emulated write in a row to this
+ * page, and unshadow/unhook if it is */
+static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
+{
+#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
+ /* Second consecutive emulated write to the same shadowed page? */
+ if ( v->arch.shadow.last_emulated_mfn == mfn_x(gmfn) &&
+ sh_mfn_is_a_page_table(gmfn) )
+ {
+ u32 flags = mfn_to_page(gmfn)->shadow_flags;
+ mfn_t smfn;
+ if ( !(flags & (SHF_L2_32|SHF_L3_PAE|SHF_L4_64)) )
+ {
+ /* Not a top-level shadow: try to drop the shadow outright */
+ perfc_incrc(shadow_early_unshadow);
+ sh_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ );
+ return;
+ }
+ /* SHF_unhooked_mappings is set to make sure we only unhook
+ * once in a single batch of updates. It is reset when this
+ * top-level page is loaded into CR3 again */
+ if ( !(flags & SHF_unhooked_mappings) )
+ {
+ perfc_incrc(shadow_early_unshadow_top);
+ mfn_to_page(gmfn)->shadow_flags |= SHF_unhooked_mappings;
+ if ( flags & SHF_L2_32 )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH_l2_32_shadow);
+ shadow_unhook_mappings(v, smfn);
+ }
+ if ( flags & SHF_L3_PAE )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH_l3_pae_shadow);
+ shadow_unhook_mappings(v, smfn);
+ }
+ if ( flags & SHF_L4_64 )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH_l4_64_shadow);
+ shadow_unhook_mappings(v, smfn);
+ }
+ }
+ }
+ /* Remember this mfn so a repeat write can trigger the heuristic */
+ v->arch.shadow.last_emulated_mfn = mfn_x(gmfn);
+#endif
+}
+
+/* Stop counting towards early unshadows, as we've seen a real page fault */
+static inline void reset_early_unshadow(struct vcpu *v)
+{
+#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
+ /* INVALID_MFN never matches a real emulated-write mfn */
+ v->arch.shadow.last_emulated_mfn = INVALID_MFN;
+#endif
+}
+
+
+
+/**************************************************************************/
+/* Entry points into the shadow code */
+
+/* Called from pagefault handler in Xen, and from the HVM trap handlers
+ * for pagefaults. Returns 1 if this fault was an artefact of the
+ * shadow code (and the guest should retry) or 0 if it is not (and the
+ * fault should be handled elsewhere or passed to the guest). */
+
+static int sh_page_fault(struct vcpu *v,
+ unsigned long va,
+ struct cpu_user_regs *regs)
+{
+ struct domain *d = v->domain;
+ walk_t gw;
+ u32 accumulated_gflags;
+ gfn_t gfn;
+ mfn_t gmfn, sl1mfn=_mfn(0);
+ shadow_l1e_t sl1e, *ptr_sl1e;
+ paddr_t gpa;
+ struct cpu_user_regs emul_regs;
+ struct x86_emulate_ctxt emul_ctxt;
+ int r, mmio;
+ fetch_type_t ft = 0;
+
+ //
+ // XXX: Need to think about eventually mapping superpages directly in the
+ // shadow (when possible), as opposed to splintering them into a
+ // bunch of 4K maps.
+ //
+
+ SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
+ v->domain->domain_id, v->vcpu_id, va, regs->error_code);
+
+ /* All shadow state for this domain is changed under the shadow lock */
+ shadow_lock(d);
+
+ shadow_audit_tables(v);
+
+ if ( guest_walk_tables(v, va, &gw, 1) != 0 )
+ {
+ SHADOW_PRINTK("malformed guest pagetable!");
+ print_gw(&gw);
+ }
+
+ sh_audit_gw(v, &gw);
+
+ // We do not look at the gw->l1e, as that will not exist for superpages.
+ // Instead, we use the gw->eff_l1e...
+ //
+ // We need not check all the levels of the guest page table entries for
+ // present vs not-present, as the eff_l1e will always be not present if
+ // one of the higher level entries is not present.
+ //
+ if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
+ {
+ if ( hvm_guest(v) && !shadow_vcpu_mode_translate(v) )
+ {
+ /* Not present in p2m map, means this is mmio */
+ /* (With translation off the fault address is used directly
+ * as the physical address for the mmio handler.) */
+ gpa = va;
+ goto mmio;
+ }
+
+ perfc_incrc(shadow_fault_bail_not_present);
+ goto not_a_shadow_fault;
+ }
+
+ // All levels of the guest page table are now known to be present.
+ accumulated_gflags = accumulate_guest_flags(&gw);
+
+ // Check for attempts to access supervisor-only pages from user mode,
+ // i.e. ring 3. Such errors are not caused or dealt with by the shadow
+ // code.
+ //
+ if ( (regs->error_code & PFEC_user_mode) &&
+ !(accumulated_gflags & _PAGE_USER) )
+ {
+ /* illegal user-mode access to supervisor-only page */
+ perfc_incrc(shadow_fault_bail_user_supervisor);
+ goto not_a_shadow_fault;
+ }
+
+ // Was it a write fault?
+ //
+ if ( regs->error_code & PFEC_write_access )
+ {
+ if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
+ {
+ perfc_incrc(shadow_fault_bail_ro_mapping);
+ goto not_a_shadow_fault;
+ }
+ }
+ else // must have been either an insn fetch or read fault
+ {
+ // Check for NX bit violations: attempts to execute code that is
+ // marked "do not execute". Such errors are not caused or dealt with
+ // by the shadow code.
+ //
+ if ( regs->error_code & PFEC_insn_fetch )
+ {
+ if ( accumulated_gflags & _PAGE_NX_BIT )
+ {
+ /* NX prevented this code fetch */
+ perfc_incrc(shadow_fault_bail_nx);
+ goto not_a_shadow_fault;
+ }
+ }
+ }
+
+ /* Is this an MMIO access? */
+ gfn = guest_l1e_get_gfn(gw.eff_l1e);
+ mmio = ( hvm_guest(v)
+ && shadow_vcpu_mode_translate(v)
+ && mmio_space(gfn_to_paddr(gfn)) );
+
+ /* For MMIO, the shadow holds the *gfn*; for normal accesses, if holds
+ * the equivalent mfn. */
+ if ( mmio )
+ gmfn = _mfn(gfn_x(gfn));
+ else
+ {
+ gmfn = vcpu_gfn_to_mfn(v, gfn);
+ if ( !valid_mfn(gmfn) )
+ {
+ perfc_incrc(shadow_fault_bail_bad_gfn);
+ SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n",
+ gfn_x(gfn), mfn_x(gmfn));
+ goto not_a_shadow_fault;
+ }
+ }
+
+ /* Make sure there is enough free shadow memory to build a chain of
+ * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
+ * to allocate all we need. (We never allocate a top-level shadow
+ * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
+ shadow_prealloc(d, SHADOW_MAX_ORDER);
+
+ /* Acquire the shadow. This must happen before we figure out the rights
+ * for the shadow entry, since we might promote a page here. */
+ // XXX -- this code will need to change somewhat if/when the shadow code
+ // can directly map superpages...
+ ft = ((regs->error_code & PFEC_write_access) ?
+ ft_demand_write : ft_demand_read);
+ ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
+ ASSERT(ptr_sl1e);
+
+ /* Calculate the shadow entry */
+ if ( ft == ft_demand_write )
+ {
+ if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) )
+ {
+ perfc_incrc(shadow_fault_emulate_write);
+ goto emulate;
+ }
+ }
+ else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) )
+ {
+ perfc_incrc(shadow_fault_emulate_read);
+ goto emulate;
+ }
+
+ /* Quick sanity check: we never make an MMIO entry that's got the
+ * _PAGE_PRESENT flag set in it. */
+ ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT));
+
+ r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
+
+ if ( mmio )
+ {
+ gpa = guest_walk_to_gpa(&gw);
+ goto mmio;
+ }
+
+#if 0
+ if ( !(r & SHADOW_SET_CHANGED) )
+ debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH_PRI_pte
+ ") did not change anything\n",
+ __func__, gw.va, l1e_get_intpte(sl1e));
+#endif
+
+ perfc_incrc(shadow_fault_fixed);
+ d->arch.shadow.fault_count++;
+ reset_early_unshadow(v);
+
+ done:
+ sh_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW_PRINTK("fixed\n");
+ shadow_audit_tables(v);
+ shadow_unlock(d);
+ return EXCRET_fault_fixed;
+
+ emulate:
+
+ /* Take the register set we were called with */
+ emul_regs = *regs;
+ if ( hvm_guest(v) )
+ {
+ /* Add the guest's segment selectors, rip, rsp. rflags */
+ hvm_store_cpu_guest_regs(v, &emul_regs, NULL);
+ }
+ emul_ctxt.regs = &emul_regs;
+ emul_ctxt.cr2 = va;
+ emul_ctxt.mode = hvm_guest(v) ? hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST;
+
+ SHADOW_PRINTK("emulate: eip=%#lx\n", emul_regs.eip);
+
+ v->arch.shadow.propagate_fault = 0;
+ if ( x86_emulate_memop(&emul_ctxt, &shadow_emulator_ops) )
+ {
+ SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
+ mfn_x(gmfn));
+ perfc_incrc(shadow_fault_emulate_failed);
+ /* If this is actually a page table, then we have a bug, and need
+ * to support more operations in the emulator. More likely,
+ * though, this is a hint that this page should not be shadowed. */
+ shadow_remove_all_shadows(v, gmfn);
+ /* This means that actual missing operations will cause the
+ * guest to loop on the same page fault. */
+ goto done;
+ }
+ if ( v->arch.shadow.propagate_fault )
+ {
+ /* Emulation triggered another page fault */
+ goto not_a_shadow_fault;
+ }
+
+ /* Emulator has changed the user registers: write back */
+ if ( hvm_guest(v) )
+ {
+ /* Write back the guest's segment selectors, rip, rsp. rflags */
+ hvm_load_cpu_guest_regs(v, &emul_regs);
+ /* And don't overwrite those in the caller's regs. */
+ emul_regs.eip = regs->eip;
+ emul_regs.cs = regs->cs;
+ emul_regs.eflags = regs->eflags;
+ emul_regs.esp = regs->esp;
+ emul_regs.ss = regs->ss;
+ emul_regs.es = regs->es;
+ emul_regs.ds = regs->ds;
+ emul_regs.fs = regs->fs;
+ emul_regs.gs = regs->gs;
+ }
+ *regs = emul_regs;
+
+ goto done;
+
+ mmio:
+ perfc_incrc(shadow_fault_mmio);
+ /* NOTE(review): only the lower bound of the APIC range is checked
+ * here — presumably an upper bound is unnecessary because higher
+ * addresses are also mmio; confirm. */
+ if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) )
+ {
+ /* Need to deal with these disabled-APIC accesses, as
+ * handle_mmio() apparently does not currently do that. */
+ /* TJD: What about it, then? For now, I'm turning this BUG()
+ * into a domain_crash() since we don't want to kill Xen. */
+ SHADOW_ERROR("disabled-APIC access: not supported\n.");
+ domain_crash(d);
+ }
+ sh_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW_PRINTK("mmio\n");
+ shadow_audit_tables(v);
+ reset_early_unshadow(v);
+ shadow_unlock(d);
+ sh_log_mmio(v, gpa);
+ handle_mmio(va, gpa);
+ return EXCRET_fault_fixed;
+
+ not_a_shadow_fault:
+ sh_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW_PRINTK("not a shadow fault\n");
+ shadow_audit_tables(v);
+ reset_early_unshadow(v);
+ shadow_unlock(d);
+ return 0;
+}
+
+
+static int
+sh_invlpg(struct vcpu *v, unsigned long va)
+/* Called when the guest requests an invlpg. Returns 1 if the invlpg
+ * instruction should be issued on the hardware, or 0 if it's safe not
+ * to do so. */
+{
+ shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va);
+
+ // XXX -- might be a good thing to prefetch the va into the shadow
+
+ // no need to flush anything if there's no SL2...
+ //
+ if ( !ptr_sl2e )
+ return 0;
+
+ // If there's nothing shadowed for this particular sl2e, then
+ // there is no need to do an invlpg, either...
+ //
+ if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) )
+ return 0;
+
+ // Check to see if the SL2 is a splintered superpage...
+ // If so, then we'll need to flush the entire TLB (because that's
+ // easier than invalidating all of the individual 4K pages).
+ //
+ /* (fl1 shadows are the l1s used to splinter guest superpages) */
+ if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info &
+ PGC_SH_type_mask) == PGC_SH_fl1_shadow )
+ {
+ local_flush_tlb();
+ return 0;
+ }
+
+ return 1;
+}
+
+static unsigned long
+sh_gva_to_gfn(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to what the *guest*
+ * pagetables would map it to. */
+/* Returns INVALID_GFN when the guest tables do not map va (callers
+ * such as sh_gva_to_gpa test for that value). */
+{
+ walk_t gw;
+ gfn_t gfn;
+
+ guest_walk_tables(v, va, &gw, 0);
+ gfn = guest_walk_to_gfn(&gw);
+ unmap_walk(v, &gw);
+
+ return gfn_x(gfn);
+}
+
+
+static unsigned long
+sh_gva_to_gpa(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to what the *guest*
+ * pagetables would map it to. */
+/* Returns the guest physical address, or 0 if va is not mapped. */
+{
+ unsigned long gfn = sh_gva_to_gfn(v, va);
+ if ( gfn == INVALID_GFN )
+ return 0;
+ else
+ return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
+}
+
+
+// XXX -- should this be in this file?
+// Or should it be moved to shadow-common.c?
+//
+/* returns a lowmem machine address of the copied HVM L3 root table
+ * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy,
+ * otherwise blank out any entries with reserved bits in them. */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long
+hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res)
+{
+ int i, f;
+ /* The flag bits that are reserved in hardware PAE l3 entries */
+ int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY);
+ l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+ memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t));
+ for ( i = 0; i < 4; i++ )
+ {
+ f = l3e_get_flags(l3tab[i]);
+ if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) )
+ new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res);
+ else
+ new_l3e = l3e_empty();
+ /* safe_write_entry avoids a torn 64-bit pagetable write */
+ safe_write_entry(&copy[i], &new_l3e);
+ }
+ return __pa(copy);
+}
+#endif
+
+
+static inline void
+sh_update_linear_entries(struct vcpu *v)
+/* Sync up all the linear mappings for this vcpu's pagetables */
+{
+ struct domain *d = v->domain;
+
+ /* Linear pagetables in PV guests
+ * ------------------------------
+ *
+ * Guest linear pagetables, which map the guest pages, are at
+ * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
+ * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
+ * are set up at shadow creation time, but (of course!) the PAE case
+ * is subtler. Normal linear mappings are made by having an entry
+ * in the top-level table that points to itself (shadow linear) or
+ * to the guest top-level table (guest linear). For PAE, to set up
+ * a linear map requires us to copy the four top-level entries into
+ * level-2 entries. That means that every time we change a PAE l3e,
+ * we need to reflect the change into the copy.
+ *
+ * Linear pagetables in HVM guests
+ * -------------------------------
+ *
+ * For HVM guests, the linear pagetables are installed in the monitor
+ * tables (since we can't put them in the shadow). Shadow linear
+ * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
+ * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
+ * a linear pagetable of the monitor tables themselves. We have
+ * the same issue of having to re-copy PAE l3 entries whevever we use
+ * PAE shadows.
+ *
+ * Because HVM guests run on the same monitor tables regardless of the
+ * shadow tables in use, the linear mapping of the shadow tables has to
+ * be updated every time v->arch.shadow_table changes.
+ */
+
+ /* Don't try to update the monitor table if it doesn't exist */
+ if ( shadow_mode_external(d)
+ && pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ return;
+
+#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
+
+ /* For PV, one l4e points at the guest l4, one points at the shadow
+ * l4. No maintenance required.
+ * For HVM, just need to update the l4e that points to the shadow l4. */
+
+ if ( shadow_mode_external(d) )
+ {
+ /* Use the linear map if we can; otherwise make a new mapping */
+ if ( v == current )
+ {
+ __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
+ l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ }
+ else
+ {
+ l4_pgentry_t *ml4e;
+ ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ sh_unmap_domain_page(ml4e);
+ }
+ }
+
+#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
+
+ /* This case only exists in HVM. To give ourselves a linear map of the
+ * shadows, we need to extend a PAE shadow to 4 levels. We do this by
+ * having a monitor l3 in slot 0 of the monitor l4 table, and
+ * copying the PAE l3 entries into it. Then, by having the monitor l4e
+ * for shadow pagetables also point to the monitor l4, we can use it
+ * to access the shadows. */
+
+ if ( shadow_mode_external(d) )
+ {
+ /* Install copies of the shadow l3es into the monitor l3 table.
+ * The monitor l3 table is hooked into slot 0 of the monitor
+ * l4 table, so we use l3 linear indices 0 to 3 */
+ shadow_l3e_t *sl3e;
+ l3_pgentry_t *ml3e;
+ mfn_t l3mfn;
+ int i;
+
+ /* Use linear mappings if we can; otherwise make new mappings */
+ if ( v == current )
+ {
+ ml3e = __linear_l3_table;
+ l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables are made up by update_cr3 */
+ sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+#else
+ sl3e = v->arch.shadow_vtable;
+#endif
+ }
+ else
+ {
+ l4_pgentry_t *ml4e;
+ ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
+ l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
+ ml3e = sh_map_domain_page(l3mfn);
+ sh_unmap_domain_page(ml4e);
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables are made up by update_cr3 */
+ sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+#else
+ sl3e = sh_map_domain_page(pagetable_get_mfn(v->arch.shadow_table));
+#endif
+ }
+
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ ml3e[i] =
+ (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
+ ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
+ __PAGE_HYPERVISOR)
+ : l3e_empty();
+ }
+
+ if ( v != current )
+ {
+ sh_unmap_domain_page(ml3e);
+#if GUEST_PAGING_LEVELS != 2
+ sh_unmap_domain_page(sl3e);
+#endif
+ }
+ }
+
+#elif CONFIG_PAGING_LEVELS == 3
+
+ /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
+ * entries in the shadow, and the shadow's l3 entries into the
+ * shadow-linear-map l2 entries in the shadow. This is safe to do
+ * because Xen does not let guests share high-slot l2 tables between l3s,
+ * so we know we're not treading on anyone's toes.
+ *
+ * HVM: need to copy the shadow's l3 entries into the
+ * shadow-linear-map l2 entries in the monitor table. This is safe
+ * because we have one monitor table for each vcpu. The monitor's
+ * own l3es don't need to be copied because they never change.
+ * XXX That might change if we start stuffing things into the rest
+ * of the monitor's virtual address space.
+ */
+ {
+ l2_pgentry_t *l2e, new_l2e;
+ shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
+ int i;
+
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables were built by update_cr3 */
+ if ( shadow_mode_external(d) )
+ shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+ else
+ BUG(); /* PV 2-on-3 is not supported yet */
+
+#else /* GUEST_PAGING_LEVELS == 3 */
+
+ /* Use local vcpu's mappings if we can; otherwise make new mappings */
+ if ( v == current )
+ {
+ shadow_l3e = v->arch.shadow_vtable;
+ if ( !shadow_mode_external(d) )
+ guest_l3e = v->arch.guest_vtable;
+ }
+ else
+ {
+ mfn_t smfn;
+ int idx;
+
+ /* Map the shadow l3 */
+ smfn = pagetable_get_mfn(v->arch.shadow_table);
+ idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable));
+ shadow_l3e = sh_map_domain_page(smfn);
+ shadow_l3e += idx;
+ if ( !shadow_mode_external(d) )
+ {
+ /* Also the guest l3 */
+ mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table);
+ guest_l3e = sh_map_domain_page(gmfn);
+ guest_l3e += guest_index(v->arch.guest_vtable);
+ }
+ }
+#endif /* GUEST_PAGING_LEVELS */
+
+ /* Choose where to write the entries, using linear maps if possible */
+ if ( v == current && shadow_mode_external(d) )
+ {
+ /* From the monitor tables, it's safe to use linear maps to update
+ * monitor l2s */
+ l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
+ }
+ else if ( shadow_mode_external(d) )
+ {
+ /* Map the monitor table's high l2 */
+ l3_pgentry_t *l3e;
+ l3e = sh_map_domain_page(
+ pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
+ l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
+ sh_unmap_domain_page(l3e);
+ }
+ else
+ {
+ /* Map the shadow table's high l2 */
+ ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
+ l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
+ }
+
+
+ if ( !shadow_mode_external(d) )
+ {
+ /* Write linear mapping of guest. */
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
+ __PAGE_HYPERVISOR)
+ : l2e_empty();
+ safe_write_entry(
+ &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
+ &new_l2e);
+ }
+ }
+
+ /* Write linear mapping of shadow. */
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
+ __PAGE_HYPERVISOR)
+ : l2e_empty();
+ safe_write_entry(
+ &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
+ &new_l2e);
+ }
+
+ if ( v != current || !shadow_mode_external(d) )
+ sh_unmap_domain_page(l2e);
+
+#if GUEST_PAGING_LEVELS == 3
+ if ( v != current)
+ {
+ sh_unmap_domain_page(shadow_l3e);
+ if ( !shadow_mode_external(d) )
+ sh_unmap_domain_page(guest_l3e);
+ }
+#endif
+ }
+
+#elif CONFIG_PAGING_LEVELS == 2
+
+ /* For PV, one l2e points at the guest l2, one points at the shadow
+ * l2. No maintenance required.
+ * For HVM, just need to update the l2e that points to the shadow l2. */
+
+ if ( shadow_mode_external(d) )
+ {
+ /* Use the linear map if we can; otherwise make a new mapping */
+ if ( v == current )
+ {
+ __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
+ l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ }
+ else
+ {
+ l2_pgentry_t *ml2e;
+ ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ sh_unmap_domain_page(ml2e);
+ }
+ }
+
+#else
+#error this should not happen
+#endif
+}
+
+
+// XXX -- should this be in this file?
+// Or should it be moved to shadow-common.c?
+//
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+void sh_pae_recopy(struct domain *d)
+/* Called whenever we write to the l3 entries of a PAE pagetable which
+ * is currently in use. Each vcpu that is using the table needs to
+ * resync its copies of the l3s in linear maps and any low-memory
+ * copies it might have made for fitting into 32bit CR3.
+ * Since linear maps are also resynced when we change CR3, we don't
+ * need to worry about changes to PAE l3es that are not currently in use.*/
+{
+    struct vcpu *v;
+    cpumask_t flush_mask = CPU_MASK_NONE;
+    ASSERT(shadow_lock_is_acquired(d));
+
+    for_each_vcpu(d, v)
+    {
+        /* Only vcpus that were flagged by the l3 write need resyncing */
+        if ( !v->arch.shadow.pae_flip_pending )
+            continue;
+
+        /* Remember this vcpu's pcpu so we can do one combined TLB
+         * flush after all copies have been refreshed */
+        cpu_set(v->processor, flush_mask);
+
+        SHADOW_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id);
+
+        /* This vcpu has a copy in its linear maps */
+        sh_update_linear_entries(v);
+        if ( hvm_guest(v) )
+        {
+            /* This vcpu has a copy in its HVM PAE l3 */
+            v->arch.hvm_vcpu.hw_cr3 =
+                hvm_pae_copy_root(v, v->arch.shadow_vtable,
+                                  !shadow_vcpu_mode_translate(v));
+        }
+#if CONFIG_PAGING_LEVELS == 3
+        else
+        {
+            /* This vcpu might have copied the l3 to below 4GB */
+            if ( v->arch.cr3 >> PAGE_SHIFT
+                 != pagetable_get_pfn(v->arch.shadow_table) )
+            {
+                /* Recopy to where that copy is. */
+                int i;
+                l3_pgentry_t *dst, *src;
+                dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */
+                src = v->arch.shadow_vtable;
+                for ( i = 0 ; i < 4 ; i++ )
+                    safe_write_entry(dst + i, src + i);
+            }
+        }
+#endif
+        v->arch.shadow.pae_flip_pending = 0;
+    }
+
+    /* A single flush covers every pcpu whose vcpu was resynced above */
+    flush_tlb_mask(flush_mask);
+}
+#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */
+
+
+/* removes:
+ * vcpu->arch.guest_vtable
+ * vcpu->arch.shadow_table
+ * vcpu->arch.shadow_vtable
+ * Does all appropriate management/bookkeeping/refcounting/etc...
+ */
+static void
+sh_detach_old_tables(struct vcpu *v)
+{
+    mfn_t smfn;
+
+    ////
+    //// vcpu->arch.guest_vtable
+    ////
+    /* A mapping of the guest top-level table only exists for external
+     * (HVM) domains and for PAE guests; PV guests at other levels use
+     * the linear map and have nothing to unmap here. */
+    if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
+         v->arch.guest_vtable )
+    {
+        // Q: why does this need to use (un)map_domain_page_*global* ?
+        sh_unmap_domain_page_global(v->arch.guest_vtable);
+        v->arch.guest_vtable = NULL;
+    }
+
+    ////
+    //// vcpu->arch.shadow_table
+    ////
+    smfn = pagetable_get_mfn(v->arch.shadow_table);
+    if ( mfn_x(smfn) )
+    {
+        ASSERT(v->arch.shadow_vtable);
+
+#if GUEST_PAGING_LEVELS == 3
+        // PAE guests do not (necessarily) use an entire page for their
+        // 4-entry L3s, so we have to deal with them specially.
+        //
+        sh_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn);
+#else
+        sh_put_ref(v, smfn, 0);
+#endif
+
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+        {
+            /* Drop this vcpu from the set of users of the l3 subshadow
+             * (the mirror of the set_bit done in sh_update_cr3) */
+            struct pae_l3_bookkeeping *info =
+                sl3p_to_info(v->arch.shadow_vtable);
+            ASSERT(test_bit(v->vcpu_id, &info->vcpus));
+            clear_bit(v->vcpu_id, &info->vcpus);
+        }
+#endif
+        v->arch.shadow_table = pagetable_null();
+    }
+
+    ////
+    //// vcpu->arch.shadow_vtable
+    ////
+    if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
+         v->arch.shadow_vtable )
+    {
+        // Q: why does this need to use (un)map_domain_page_*global* ?
+        //
+        sh_unmap_domain_page_global(v->arch.shadow_vtable);
+        v->arch.shadow_vtable = NULL;
+    }
+}
+
+static void
+sh_update_cr3(struct vcpu *v)
+/* Updates vcpu->arch.shadow_table after the guest has changed CR3.
+ * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
+ * if appropriate).
+ * HVM guests should also set hvm_get_guest_cntl_reg(v, 3)...
+ */
+{
+    struct domain *d = v->domain;
+    mfn_t gmfn, smfn;
+#if GUEST_PAGING_LEVELS == 3
+    u32 guest_idx=0;
+#endif
+
+    ASSERT(shadow_lock_is_acquired(v->domain));
+    ASSERT(v->arch.shadow.mode);
+
+    ////
+    //// vcpu->arch.guest_table is already set
+    ////
+
+#ifndef NDEBUG
+    /* Double-check that the HVM code has sent us a sane guest_table */
+    if ( hvm_guest(v) )
+    {
+        gfn_t gfn;
+
+        ASSERT(shadow_mode_external(d));
+
+        // Is paging enabled on this vcpu?
+        if ( shadow_vcpu_mode_translate(v) )
+        {
+            gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
+            gmfn = vcpu_gfn_to_mfn(v, gfn);
+            ASSERT(valid_mfn(gmfn));
+            ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
+        }
+        else
+        {
+            /* Paging disabled: guest_table points at (part of) p2m */
+#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
+            /* For everything else, they should be the same */
+            ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
+#endif
+        }
+    }
+#endif
+
+    SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
+                   d->domain_id, v->vcpu_id,
+                   (unsigned long)pagetable_get_pfn(v->arch.guest_table));
+
+#if GUEST_PAGING_LEVELS == 4
+    /* 64bit PV guests have separate kernel- and user-mode top-level
+     * tables; pick the one for the mode the vcpu is currently in. */
+    if ( !(v->arch.flags & TF_kernel_mode) )
+        gmfn = pagetable_get_mfn(v->arch.guest_table_user);
+    else
+#endif
+        gmfn = pagetable_get_mfn(v->arch.guest_table);
+
+    sh_detach_old_tables(v);
+
+    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+    {
+        ASSERT(v->arch.cr3 == 0);
+        return;
+    }
+
+    ////
+    //// vcpu->arch.guest_vtable
+    ////
+    if ( shadow_mode_external(d) )
+    {
+#if GUEST_PAGING_LEVELS == 3
+        if ( shadow_vcpu_mode_translate(v) )
+            /* Paging enabled: find where in the page the l3 table is */
+            guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
+        else
+            /* Paging disabled: l3 is at the start of a page (in the p2m) */
+            guest_idx = 0;
+
+        // Ignore the low 2 bits of guest_idx -- they are really just
+        // cache control.
+        guest_idx &= ~3;
+        // XXX - why does this need a global map?
+        v->arch.guest_vtable =
+            (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx;
+#else
+        // XXX - why does this need a global map?
+        v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
+#endif
+    }
+    else
+    {
+#ifdef __x86_64__
+        v->arch.guest_vtable = __linear_l4_table;
+#elif GUEST_PAGING_LEVELS == 3
+        // XXX - why does this need a global map?
+        v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
+#else
+        v->arch.guest_vtable = __linear_l2_table;
+#endif
+    }
+
+#if 0
+    printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
+           __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
+#endif
+
+    ////
+    //// vcpu->arch.shadow_table
+    ////
+    /* Look for an existing root shadow of this guest table; make one
+     * (revoking guest write access first) if there isn't one. */
+    smfn = get_shadow_status(v, gmfn, PGC_SH_guest_root_type);
+    if ( valid_mfn(smfn) )
+    {
+        /* Pull this root shadow to the front of the list of roots. */
+        list_del(&mfn_to_page(smfn)->list);
+        list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows);
+    }
+    else
+    {
+        /* This guest MFN is a pagetable. Must revoke write access. */
+        if ( shadow_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0)
+             != 0 )
+            flush_tlb_mask(d->domain_dirty_cpumask);
+        /* Make sure there's enough free shadow memory. */
+        shadow_prealloc(d, SHADOW_MAX_ORDER);
+        /* Shadow the page. */
+        smfn = sh_make_shadow(v, gmfn, PGC_SH_guest_root_type);
+        list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows);
+    }
+    ASSERT(valid_mfn(smfn));
+    v->arch.shadow_table = pagetable_from_mfn(smfn);
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
+    /* Once again OK to unhook entries from this table if we see fork/exit */
+    ASSERT(sh_mfn_is_a_page_table(gmfn));
+    mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
+#endif
+
+
+    ////
+    //// vcpu->arch.shadow_vtable
+    ////
+    if ( shadow_mode_external(d) )
+    {
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+        mfn_t adjusted_smfn = smfn;
+        u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx);
+        // Q: why does this need to use (un)map_domain_page_*global* ?
+        v->arch.shadow_vtable =
+            (shadow_l3e_t *)sh_map_domain_page_global(adjusted_smfn) +
+            shadow_idx;
+#else
+        // Q: why does this need to use (un)map_domain_page_*global* ?
+        v->arch.shadow_vtable = sh_map_domain_page_global(smfn);
+#endif
+    }
+    else
+    {
+#if SHADOW_PAGING_LEVELS == 4
+        v->arch.shadow_vtable = __sh_linear_l4_table;
+#elif GUEST_PAGING_LEVELS == 3
+        // XXX - why does this need a global map?
+        v->arch.shadow_vtable = sh_map_domain_page_global(smfn);
+#else
+        v->arch.shadow_vtable = __sh_linear_l2_table;
+#endif
+    }
+
+    ////
+    //// Take a ref to the new shadow table, and pin it.
+    ////
+    //
+    // This ref is logically "held" by v->arch.shadow_table entry itself.
+    // Release the old ref.
+    //
+#if GUEST_PAGING_LEVELS == 3
+    // PAE guests do not (necessarily) use an entire page for their
+    // 4-entry L3s, so we have to deal with them specially.
+    //
+    // XXX - might want to revisit this if/when we do multiple compilation for
+    //       HVM-vs-PV guests, as PAE PV guests could get away without doing
+    //       subshadows.
+    //
+    sh_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn);
+    sh_pin_l3_subshadow(v->arch.shadow_vtable, smfn);
+#else
+    sh_get_ref(smfn, 0);
+    sh_pin(smfn);
+#endif
+
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+    // PAE 3-on-3 shadows have to keep track of which vcpu's are using
+    // which l3 subshadow, in order to handle the SHADOW_SET_L3PAE_RECOPY
+    // case from validate_gl3e(). Search for SHADOW_SET_L3PAE_RECOPY
+    // in the code for more info.
+    //
+    {
+        struct pae_l3_bookkeeping *info =
+            sl3p_to_info(v->arch.shadow_vtable);
+        ASSERT(!test_bit(v->vcpu_id, &info->vcpus));
+        set_bit(v->vcpu_id, &info->vcpus);
+    }
+#endif
+
+    // NOTE(review): gmfn/smfn are mfn_t here but are printed with %05lx;
+    // this assumes mfn_t is (or prints as) unsigned long -- confirm the
+    // mfn_t representation under the TYPE_SAFE build option.
+    debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n",
+                      __func__, gmfn, smfn);
+
+    ///
+    /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3
+    ///
+    if ( shadow_mode_external(d) )
+    {
+        ASSERT(hvm_guest(v));
+        make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
+
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+#if SHADOW_PAGING_LEVELS != 3
+#error unexpected combination of GUEST and SHADOW paging levels
+#endif
+        /* 2-on-3: make a PAE l3 table that points at the four-page l2 */
+        {
+            mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table);
+            int i;
+
+            ASSERT(v->arch.hvm_vcpu.hw_cr3 ==
+                   virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab));
+            for (i = 0; i < 4; i++)
+            {
+                v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] =
+                    shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT);
+            }
+        }
+#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+        /* 3-on-3: copy the shadow l3 to slots that are below 4GB.
+         * If paging is disabled, clear l3e reserved bits; otherwise
+         * remove entries that have reserved bits set. */
+        v->arch.hvm_vcpu.hw_cr3 =
+            hvm_pae_copy_root(v, v->arch.shadow_vtable,
+                              !shadow_vcpu_mode_translate(v));
+#else
+        /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */
+        v->arch.hvm_vcpu.hw_cr3 =
+            pagetable_get_paddr(v->arch.shadow_table);
+#endif
+    }
+    else // not shadow_mode_external...
+    {
+        /* We don't support PV except guest == shadow == config levels */
+        BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
+        make_cr3(v, pagetable_get_pfn(v->arch.shadow_table));
+    }
+
+    /* Fix up the linear pagetable mappings */
+    sh_update_linear_entries(v);
+}
+
+
+/**************************************************************************/
+/* Functions to revoke guest rights */
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
+/* Look up this vaddr in the current shadow and see if it's a writeable
+ * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
+{
+    shadow_l1e_t sl1e, *sl1p;
+    shadow_l2e_t *sl2p;
+#if GUEST_PAGING_LEVELS >= 3
+    shadow_l3e_t *sl3p;
+#if GUEST_PAGING_LEVELS >= 4
+    shadow_l4e_t *sl4p;
+#endif
+#endif
+    mfn_t sl1mfn;
+
+
+    /* Carefully look in the shadow linear map for the l1e we expect */
+    /* Walk down from the top shadow level, bailing out (returning 0,
+     * i.e. "guess failed") as soon as any level is not present. */
+    if ( v->arch.shadow_vtable == NULL ) return 0;
+#if GUEST_PAGING_LEVELS >= 4
+    sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
+    if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
+        return 0;
+    sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
+    if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
+        return 0;
+#elif GUEST_PAGING_LEVELS == 3
+    sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable)
+        + shadow_l3_linear_offset(vaddr);
+    if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
+        return 0;
+#endif
+    sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
+    if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
+        return 0;
+    sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
+    sl1e = *sl1p;
+    /* The guess only succeeds if the l1e is a present, writeable
+     * mapping of exactly the gmfn we were asked about */
+    if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
+          != (_PAGE_PRESENT|_PAGE_RW))
+         || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
+        return 0;
+
+    /* Found it! Need to remove its write permissions. */
+    sl1mfn = shadow_l2e_get_mfn(*sl2p);
+    sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
+    shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
+    return 1;
+}
+#endif
+
+int sh_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn)
+/* Excises all writeable mappings to readonly_mfn from this l1 shadow table.
+ * Returns nonzero iff it stopped early because the target page's type
+ * count fell to zero (i.e. no writeable mappings remain anywhere). */
+{
+    shadow_l1e_t *sl1e;
+    int done = 0;
+    int flags;
+
+    SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
+    {
+        flags = shadow_l1e_get_flags(*sl1e);
+        if ( (flags & _PAGE_PRESENT)
+             && (flags & _PAGE_RW)
+             && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
+        {
+            shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
+            if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
+                  & PGT_count_mask) == 0 )
+                /* This breaks us cleanly out of the FOREACH macro */
+                done = 1;
+        }
+    });
+    return done;
+}
+
+
+int sh_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
+/* Excises all mappings to guest frame from this shadow l1 table.
+ * Returns nonzero iff it stopped early because the target page's
+ * general reference count fell to zero. */
+{
+    shadow_l1e_t *sl1e;
+    int done = 0;
+    int flags;
+
+    SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
+    {
+        flags = shadow_l1e_get_flags(*sl1e);
+        if ( (flags & _PAGE_PRESENT)
+             && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
+        {
+            shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
+            if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
+                /* This breaks us cleanly out of the FOREACH macro */
+                done = 1;
+        }
+    });
+    return done;
+}
+
+/**************************************************************************/
+/* Functions to excise all pointers to shadows from higher-level shadows. */
+
+void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
+/* Blank out a single shadow entry.  Dispatches on the shadow type
+ * recorded in the page's count_info so the right-sized empty entry
+ * is written with the right set-function (which handles refcounts). */
+{
+    switch (mfn_to_page(smfn)->count_info & PGC_SH_type_mask)
+    {
+    case PGC_SH_l1_shadow:
+        shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
+    case PGC_SH_l2_shadow:
+#if GUEST_PAGING_LEVELS == 3
+    case PGC_SH_l2h_shadow:
+#endif
+        shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
+#if GUEST_PAGING_LEVELS >= 3
+    case PGC_SH_l3_shadow:
+        shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
+#if GUEST_PAGING_LEVELS >= 4
+    case PGC_SH_l4_shadow:
+        shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
+#endif
+#endif
+    default: BUG(); /* Called with the wrong kind of shadow. */
+    }
+}
+
+int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
+/* Remove all mappings of this l1 shadow from this l2 shadow.
+ * Returns nonzero iff it stopped early because the l1 shadow's type
+ * field was cleared (i.e. the shadow page has been freed). */
+{
+    shadow_l2e_t *sl2e;
+    int done = 0;
+    int flags;
+#if GUEST_PAGING_LEVELS != 4
+    /* Skip the Xen-private mappings in non-external l2s */
+    int xen_mappings = !shadow_mode_external(v->domain);
+#endif
+
+    SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings,
+    {
+        flags = shadow_l2e_get_flags(*sl2e);
+        if ( (flags & _PAGE_PRESENT)
+             && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
+        {
+            shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+            if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH_type_mask) == 0 )
+                /* This breaks us cleanly out of the FOREACH macro */
+                done = 1;
+        }
+    });
+    return done;
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
+/* Remove all mappings of this l2 shadow from this l3 shadow.
+ * Returns nonzero iff it stopped early because the l2 shadow's type
+ * field was cleared. */
+{
+    shadow_l3e_t *sl3e;
+    int done = 0;
+    int flags;
+
+    SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
+    {
+        flags = shadow_l3e_get_flags(*sl3e);
+        if ( (flags & _PAGE_PRESENT)
+             && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
+        {
+            shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
+            if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask) == 0 )
+                /* This breaks us cleanly out of the FOREACH macro */
+                done = 1;
+        }
+    });
+    return done;
+}
+
+#if GUEST_PAGING_LEVELS >= 4
+int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
+/* Remove all mappings of this l3 shadow from this l4 shadow.
+ * Returns nonzero iff it stopped early because the l3 shadow's type
+ * field was cleared. */
+{
+    shadow_l4e_t *sl4e;
+    int done = 0;
+    /* Skip the Xen-private mappings in non-external l4s */
+    int flags, xen_mappings = !shadow_mode_external(v->domain);
+
+    SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
+    {
+        flags = shadow_l4e_get_flags(*sl4e);
+        if ( (flags & _PAGE_PRESENT)
+             && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
+        {
+            shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+            if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH_type_mask) == 0 )
+                /* This breaks us cleanly out of the FOREACH macro */
+                done = 1;
+        }
+    });
+    return done;
+}
+#endif /* 64bit guest */
+#endif /* PAE guest */
+
+/**************************************************************************/
+/* Handling HVM guest writes to pagetables */
+
+/* Check that the user is allowed to perform this write.
+ * Returns a mapped pointer to write to, and the mfn it's on,
+ * or NULL for error. */
+static inline void * emulate_map_dest(struct vcpu *v,
+                                      unsigned long vaddr,
+                                      struct x86_emulate_ctxt *ctxt,
+                                      mfn_t *mfnp)
+{
+    walk_t gw;
+    u32 flags;
+    gfn_t gfn;
+    mfn_t mfn;
+
+    /* Walk the guest pagetables to find the mapping and its permissions;
+     * the '1' asks for a walk suitable for a write access */
+    guest_walk_tables(v, vaddr, &gw, 1);
+    flags = accumulate_guest_flags(&gw);
+    gfn = guest_l1e_get_gfn(gw.eff_l1e);
+    mfn = vcpu_gfn_to_mfn(v, gfn);
+    sh_audit_gw(v, &gw);
+    unmap_walk(v, &gw);
+
+    /* The write must be to a present, writeable mapping, and if the
+     * emulated access came from ring 3 it must be user-accessible too */
+    if ( !(flags & _PAGE_PRESENT)
+         || !(flags & _PAGE_RW)
+         || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) )
+    {
+        /* This write would have faulted even on bare metal */
+        v->arch.shadow.propagate_fault = 1;
+        return NULL;
+    }
+
+    if ( !valid_mfn(mfn) )
+    {
+        /* Attempted a write to a bad gfn. This should never happen:
+         * after all, we're here because this write is to a page table. */
+        BUG();
+    }
+
+    ASSERT(sh_mfn_is_a_page_table(mfn));
+    *mfnp = mfn;
+    /* Caller must sh_unmap_domain_page() the returned pointer */
+    return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
+}
+
+int
+sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
+                      u32 bytes, struct x86_emulate_ctxt *ctxt)
+/* Perform an emulated write of 'bytes' bytes from 'src' to guest
+ * virtual address 'vaddr', propagating the change into the shadows.
+ * Splits the write at page boundaries, since each page must be mapped
+ * and validated separately.
+ * Returns X86EMUL_CONTINUE on success, or X86EMUL_PROPAGATE_FAULT if
+ * the guest mapping would have faulted on real hardware. */
+{
+    ASSERT(shadow_lock_is_acquired(v->domain));
+    while ( bytes > 0 )
+    {
+        mfn_t mfn;
+        int bytes_on_page;
+        void *addr;
+
+        bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK);
+        if ( bytes_on_page > bytes )
+            bytes_on_page = bytes;
+
+        if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+            return X86EMUL_PROPAGATE_FAULT;
+        memcpy(addr, src, bytes_on_page);
+        shadow_validate_guest_pt_write(v, mfn, addr, bytes_on_page);
+        /* If we are writing zeros to this page, might want to unshadow */
+        if ( *(u8 *)addr == 0 )
+            check_for_early_unshadow(v, mfn);
+        sh_unmap_domain_page(addr);
+        /* Advance to the next page's worth of data.  Without these
+         * increments a write that crossed a page boundary would re-map
+         * the first page and re-copy the first chunk forever. */
+        bytes -= bytes_on_page;
+        vaddr += bytes_on_page;
+        src = (u8 *)src + bytes_on_page;
+    }
+    shadow_audit_tables(v);
+    return X86EMUL_CONTINUE;
+}
+
+int
+sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
+                        unsigned long old, unsigned long new,
+                        unsigned int bytes, struct x86_emulate_ctxt *ctxt)
+/* Perform an emulated CMPXCHG of up to sizeof(unsigned long) bytes at
+ * guest virtual address 'vaddr', propagating a successful exchange into
+ * the shadows.  Returns X86EMUL_CONTINUE, X86EMUL_CMPXCHG_FAILED if the
+ * comparison failed, or X86EMUL_PROPAGATE_FAULT on a bad mapping. */
+{
+    mfn_t mfn;
+    void *addr;
+    unsigned long prev;
+    int rv = X86EMUL_CONTINUE;
+
+    ASSERT(shadow_lock_is_acquired(v->domain));
+    ASSERT(bytes <= sizeof (unsigned long));
+
+    if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+        return X86EMUL_PROPAGATE_FAULT;
+
+    switch (bytes)
+    {
+    case 1: prev = cmpxchg(((u8 *)addr), old, new);  break;
+    case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
+    case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
+    case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
+    default:
+        SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
+        prev = ~old; /* Guarantee the comparison below fails */
+    }
+
+    if ( (prev == old)  )
+        shadow_validate_guest_pt_write(v, mfn, addr, bytes);
+    else
+        rv = X86EMUL_CMPXCHG_FAILED;
+
+    SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
+                  " wanted %#lx now %#lx bytes %u\n",
+                  vaddr, prev, old, new, *(unsigned long *)addr, bytes);
+
+    /* If we are writing zeros to this page, might want to unshadow */
+    if ( *(u8 *)addr == 0 )
+        check_for_early_unshadow(v, mfn);
+
+    sh_unmap_domain_page(addr);
+    shadow_audit_tables(v);
+    /* Fix: dropped a second, unconditional check_for_early_unshadow()
+     * call that duplicated the zero-byte-guarded one above. */
+    return rv;
+}
+
+int
+sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
+                          unsigned long old_lo, unsigned long old_hi,
+                          unsigned long new_lo, unsigned long new_hi,
+                          struct x86_emulate_ctxt *ctxt)
+/* Perform an emulated CMPXCHG8B at guest virtual address 'vaddr',
+ * propagating a successful exchange into the shadows.  The 64-bit old
+ * and new values arrive as lo/hi halves, matching the instruction's
+ * EDX:EAX / ECX:EBX operands.  Returns X86EMUL_CONTINUE,
+ * X86EMUL_CMPXCHG_FAILED, or X86EMUL_PROPAGATE_FAULT. */
+{
+    mfn_t mfn;
+    void *addr;
+    u64 old, new, prev;
+    int rv = X86EMUL_CONTINUE;
+
+    ASSERT(shadow_lock_is_acquired(v->domain));
+
+    if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+        return X86EMUL_PROPAGATE_FAULT;
+
+    old = (((u64) old_hi) << 32) | (u64) old_lo;
+    new = (((u64) new_hi) << 32) | (u64) new_lo;
+    prev = cmpxchg(((u64 *)addr), old, new);
+
+    if ( (prev == old)  )
+        shadow_validate_guest_pt_write(v, mfn, addr, 8);
+    else
+        rv = X86EMUL_CMPXCHG_FAILED;
+
+    /* If we are writing zeros to this page, might want to unshadow */
+    if ( *(u8 *)addr == 0 )
+        check_for_early_unshadow(v, mfn);
+
+    sh_unmap_domain_page(addr);
+    shadow_audit_tables(v);
+    /* Fix: dropped a second, unconditional check_for_early_unshadow()
+     * call that duplicated the zero-byte-guarded one above. */
+    return rv;
+}
+
+
+/**************************************************************************/
+/* Audit tools */
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+
+/* Report an audit failure: dump the guest/shadow mfns, entry pointers
+ * and entry values for the offending level, then BUG().  Relies on the
+ * gl<N>e/sl<N>e/gl<N>mfn/sl<N>mfn naming convention used by the
+ * sh_audit_l<N>_table functions below.  (done = 1 after BUG() is only
+ * there to satisfy the FOREACH macros' early-exit protocol.) */
+#define AUDIT_FAIL(_level, _fmt, _a...) do {                               \
+    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"         \
+           "gl" #_level "mfn = %" SH_PRI_mfn                              \
+           " sl" #_level "mfn = %" SH_PRI_mfn                             \
+           " &gl" #_level "e = %p &sl" #_level "e = %p"                   \
+           " gl" #_level "e = %" SH_PRI_gpte                              \
+           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",        \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                     \
+           _level, guest_index(gl ## _level ## e),                        \
+           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),        \
+           gl ## _level ## e, sl ## _level ## e,                          \
+           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
+           ##_a);                                                         \
+    BUG();                                                                \
+    done = 1;                                                             \
+} while (0)
+
+
+static char * sh_audit_flags(struct vcpu *v, int level,
+                              int gflags, int sflags)
+/* Common code for auditing flag bits.  Compares a shadow entry's flags
+ * against the corresponding guest entry's flags and returns a static
+ * error string describing the first mismatch found, or NULL if the
+ * flags are consistent. */
+{
+    if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
+        return "shadow is present but guest is not present";
+    if ( (sflags & _PAGE_GLOBAL) && !hvm_guest(v) )
+        return "global bit set in PV shadow";
+    /* Dirty propagation only applies at the level of the final mapping:
+     * l1 entries, or l2 superpage (PSE) entries */
+    if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
+         && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) )
+        return "dirty bit not propagated";
+    if ( level == 2 && (sflags & _PAGE_PSE) )
+        return "PS bit set in shadow";
+#if SHADOW_PAGING_LEVELS == 3
+    if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
+#endif
+    if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
+        return "user/supervisor bit does not match";
+    if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
+        return "NX bit does not match";
+    if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
+        return "shadow grants write access but guest does not";
+    if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) )
+        return "accessed bit not propagated";
+    return NULL;
+}
+
+static inline mfn_t
+audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
+/* Convert this gfn to an mfn in the manner appropriate for the
+ * guest pagetable it's used in (gmfn) */
+{
+    /* Non-translated guests use mfns directly in their pagetables */
+    if ( !shadow_mode_translate(v->domain) )
+        return _mfn(gfn_x(gfn));
+
+    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
+         != PGT_writable_page )
+        return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
+    else
+        return sh_gfn_to_mfn(v->domain, gfn_x(gfn));
+}
+
+
+int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
+/* Audit one l1 shadow against its guest l1: check flags and, if the
+ * MFN-audit is enabled, the gfn->mfn translations.  AUDIT_FAIL BUG()s
+ * on any mismatch. */
+{
+    guest_l1e_t *gl1e, *gp;
+    shadow_l1e_t *sl1e;
+    mfn_t mfn, gmfn, gl1mfn;
+    gfn_t gfn;
+    char *s;
+    int done = 0;
+
+    /* Follow the backpointer */
+    gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info);
+    gl1e = gp = sh_map_domain_page(gl1mfn);
+    SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
+
+        s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
+                            shadow_l1e_get_flags(*sl1e));
+        if ( s ) AUDIT_FAIL(1, "%s", s);
+
+        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
+        {
+            gfn = guest_l1e_get_gfn(*gl1e);
+            mfn = shadow_l1e_get_mfn(*sl1e);
+            gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
+            if ( mfn_x(gmfn) != mfn_x(mfn) )
+                AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
+                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
+                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+        }
+    });
+    sh_unmap_domain_page(gp);
+    return done;
+}
+
+int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
+/* Audit a "fake" l1 shadow (one that shadows a superpage mapping and so
+ * has no real guest l1 behind it).  Only the flag patterns can be
+ * checked.  Always returns 0: AUDIT_FAIL BUG()s before 'done' matters. */
+{
+    guest_l1e_t *gl1e, e;
+    shadow_l1e_t *sl1e;
+    mfn_t gl1mfn = _mfn(INVALID_MFN);
+    int f;
+    int done = 0;
+
+    /* fl1 has no useful backpointer: all we can check are flags */
+    e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
+    SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
+        f = shadow_l1e_get_flags(*sl1e);
+        /* AVAIL bits are software-defined, so ignore them */
+        f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
+        if ( !(f == 0
+               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+                        _PAGE_ACCESSED|_PAGE_DIRTY)
+               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) )
+            AUDIT_FAIL(1, "fl1e has bad flags");
+    });
+    return 0;
+}
+
+int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
+/* Audit one l2 shadow against its guest l2.  For PSE guest entries the
+ * expected shadow target is the fl1 shadow of the superpage's gfn;
+ * otherwise it is the l1 shadow of the guest l1's mfn. */
+{
+    guest_l2e_t *gl2e, *gp;
+    shadow_l2e_t *sl2e;
+    mfn_t mfn, gmfn, gl2mfn;
+    gfn_t gfn;
+    char *s;
+    int done = 0;
+#if GUEST_PAGING_LEVELS != 4
+    /* Skip the Xen-private mappings in non-external l2s */
+    int xen_mappings = !shadow_mode_external(v->domain);
+#endif
+
+    /* Follow the backpointer */
+    gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info);
+    gl2e = gp = sh_map_domain_page(gl2mfn);
+    SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
+
+        s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
+                            shadow_l2e_get_flags(*sl2e));
+        if ( s ) AUDIT_FAIL(2, "%s", s);
+
+        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
+        {
+            gfn = guest_l2e_get_gfn(*gl2e);
+            mfn = shadow_l2e_get_mfn(*sl2e);
+            gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
+                ? get_fl1_shadow_status(v, gfn)
+                : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
+                                    PGC_SH_l1_shadow);
+            if ( mfn_x(gmfn) != mfn_x(mfn) )
+                AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
+                           " (--> %" SH_PRI_mfn ")"
+                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
+                           gfn_x(gfn),
+                           (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
+                           : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
+                           mfn_x(gmfn), mfn_x(mfn));
+        }
+    });
+    sh_unmap_domain_page(gp);
+    return 0;
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
+/* Audit one l3 shadow against its guest l3.  For PAE PV guests the
+ * fourth l3 slot shadows the Xen-private l2h type rather than a plain
+ * l2 shadow. */
+{
+    guest_l3e_t *gl3e, *gp;
+    shadow_l3e_t *sl3e;
+    mfn_t mfn, gmfn, gl3mfn;
+    gfn_t gfn;
+    char *s;
+    int done = 0;
+
+    /* Follow the backpointer */
+    gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info);
+    gl3e = gp = sh_map_domain_page(gl3mfn);
+    SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
+
+        s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
+                            shadow_l3e_get_flags(*sl3e));
+        if ( s ) AUDIT_FAIL(3, "%s", s);
+
+        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
+        {
+            gfn = guest_l3e_get_gfn(*gl3e);
+            mfn = shadow_l3e_get_mfn(*sl3e);
+            gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
+                                     (GUEST_PAGING_LEVELS == 3
+                                      && !shadow_mode_external(v->domain)
+                                      && (guest_index(gl3e) % 4) == 3)
+                                     ? PGC_SH_l2h_pae_shadow
+                                     : PGC_SH_l2_shadow);
+            if ( mfn_x(gmfn) != mfn_x(mfn) )
+                AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
+                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
+                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+        }
+    });
+    sh_unmap_domain_page(gp);
+    return 0;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+#if GUEST_PAGING_LEVELS >= 4
+int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
+/* Audit one l4 shadow against its guest l4: each present shadow entry
+ * must point at the l3 shadow of the mfn named by the guest entry. */
+{
+    guest_l4e_t *gl4e, *gp;
+    shadow_l4e_t *sl4e;
+    mfn_t mfn, gmfn, gl4mfn;
+    gfn_t gfn;
+    char *s;
+    int done = 0;
+    /* Skip the Xen-private mappings in non-external l4s */
+    int xen_mappings = !shadow_mode_external(v->domain);
+
+    /* Follow the backpointer */
+    gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info);
+    gl4e = gp = sh_map_domain_page(gl4mfn);
+    SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
+    {
+        s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
+                            shadow_l4e_get_flags(*sl4e));
+        if ( s ) AUDIT_FAIL(4, "%s", s);
+
+        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
+        {
+            gfn = guest_l4e_get_gfn(*gl4e);
+            mfn = shadow_l4e_get_mfn(*sl4e);
+            gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
+                                     PGC_SH_l3_shadow);
+            if ( mfn_x(gmfn) != mfn_x(mfn) )
+                AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
+                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
+                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+        }
+    });
+    sh_unmap_domain_page(gp);
+    return 0;
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#undef AUDIT_FAIL
+
+#endif /* Audit code */
+
+/**************************************************************************/
+/* Entry points into this mode of the shadow code.
+ * This will all be mangled by the preprocessor to uniquify everything. */
+struct shadow_paging_mode sh_paging_mode = {
+    /* Fault handling and guest-virtual address translation */
+    .page_fault = sh_page_fault,
+    .invlpg = sh_invlpg,
+    .gva_to_gpa = sh_gva_to_gpa,
+    .gva_to_gfn = sh_gva_to_gfn,
+    .update_cr3 = sh_update_cr3,
+    /* Resynchronising shadows after guest pagetable writes */
+    .map_and_validate_gl1e = sh_map_and_validate_gl1e,
+    .map_and_validate_gl2e = sh_map_and_validate_gl2e,
+    .map_and_validate_gl2he = sh_map_and_validate_gl2he,
+    .map_and_validate_gl3e = sh_map_and_validate_gl3e,
+    .map_and_validate_gl4e = sh_map_and_validate_gl4e,
+    .detach_old_tables = sh_detach_old_tables,
+    /* Emulated writes to guest pagetables */
+    .x86_emulate_write = sh_x86_emulate_write,
+    .x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
+    .x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
+    /* Monitor (hypervisor-side) pagetable lifecycle */
+    .make_monitor_table = sh_make_monitor_table,
+    .destroy_monitor_table = sh_destroy_monitor_table,
+#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+    .guess_wrmap = sh_guess_wrmap,
+#endif
+    /* The (guest, shadow) level pair this compilation implements */
+    .guest_levels = GUEST_PAGING_LEVELS,
+    .shadow_levels = SHADOW_PAGING_LEVELS,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * arch/x86/mm/shadow/multi.h
+ *
+ * Shadow declarations which will be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* Resync shadow state after the guest writes [new_glNp, new_glNp+size)
+ * into the pagetable page at glNmfn. */
+extern int
+SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size);
+extern int
+SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
+extern int
+SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
+extern int
+SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size);
+extern int
+SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size);
+
+/* Tear down a single shadow page of the given level. */
+extern void
+SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t smfn);
+extern void
+SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t smfn);
+extern void
+SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t smfn);
+extern void
+SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t smfn);
+
+/* PAE-only (3-on-3) helper: note the fixed level arguments. */
+extern void
+SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows, 3, 3)
+    (struct vcpu *v, mfn_t smfn);
+
+/* Remove the guest-visible mappings from a top-level shadow, one
+ * variant per guest pagetable flavour. */
+extern void
+SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl2mfn);
+extern void
+SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl3mfn);
+extern void
+SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl4mfn);
+
+/* Strip mappings of a target frame out of an L1 shadow. */
+extern int
+SHADOW_INTERNAL_NAME(sh_remove_write_access, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn);
+extern int
+SHADOW_INTERNAL_NAME(sh_remove_all_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn);
+
+/* Blank one shadow entry (ep points into the shadow page smfn). */
+extern void
+SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, void *ep, mfn_t smfn);
+
+/* Remove a lower-level shadow's entry from its parent shadow. */
+extern int
+SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn);
+extern int
+SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn);
+extern int
+SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn);
+
+/* Consistency checkers, compiled in only when entry auditing is on. */
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+int
+SHADOW_INTERNAL_NAME(sh_audit_l1_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl1mfn, mfn_t x);
+int
+SHADOW_INTERNAL_NAME(sh_audit_fl1_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl1mfn, mfn_t x);
+int
+SHADOW_INTERNAL_NAME(sh_audit_l2_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl2mfn, mfn_t x);
+int
+SHADOW_INTERNAL_NAME(sh_audit_l3_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl3mfn, mfn_t x);
+int
+SHADOW_INTERNAL_NAME(sh_audit_l4_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl4mfn, mfn_t x);
+#endif
+
+/* Monitor tables exist only where shadow and guest levels coincide. */
+#if SHADOW_LEVELS == GUEST_LEVELS
+extern mfn_t
+SHADOW_INTERNAL_NAME(sh_make_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v);
+extern void
+SHADOW_INTERNAL_NAME(sh_destroy_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t mmfn);
+#endif
+
+/* The per-(guest,shadow)-levels entry-point table for this flavour. */
+extern struct shadow_paging_mode
+SHADOW_INTERNAL_NAME(sh_paging_mode, SHADOW_LEVELS, GUEST_LEVELS);
--- /dev/null
+
+#ifndef __X86_PAGE_GUEST_H__
+#define __X86_PAGE_GUEST_H__
+
+#ifndef __ASSEMBLY__
+# include <asm/types.h>
+#endif
+
+#define PAGETABLE_ORDER_32 10
+#define L1_PAGETABLE_ENTRIES_32 (1<<PAGETABLE_ORDER_32)
+#define L2_PAGETABLE_ENTRIES_32 (1<<PAGETABLE_ORDER_32)
+#define ROOT_PAGETABLE_ENTRIES_32 L2_PAGETABLE_ENTRIES_32
+
+
+#define L1_PAGETABLE_SHIFT_32 12
+#define L2_PAGETABLE_SHIFT_32 22
+
+/* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. */
+
+#ifndef __ASSEMBLY__
+
+/* Raw 32-bit (2-level, non-PAE) guest pagetable entry. */
+typedef u32 intpte_32_t;
+
+typedef struct { intpte_32_t l1; } l1_pgentry_32_t;
+typedef struct { intpte_32_t l2; } l2_pgentry_32_t;
+/* NOTE(review): this aliases the *native* l2_pgentry_t rather than
+ * l2_pgentry_32_t -- confirm that is intentional. */
+typedef l2_pgentry_t root_pgentry_32_t;
+#endif
+
+#define get_pte_flags_32(x) ((u32)(x) & 0xFFF)
+#define put_pte_flags_32(x) ((intpte_32_t)(x))
+
+/* Get pte access flags (unsigned int). */
+#define l1e_get_flags_32(x) (get_pte_flags_32((x).l1))
+#define l2e_get_flags_32(x) (get_pte_flags_32((x).l2))
+
+#define l1e_get_paddr_32(x) \
+ ((paddr_t)(((x).l1 & (PADDR_MASK&PAGE_MASK))))
+#define l2e_get_paddr_32(x) \
+ ((paddr_t)(((x).l2 & (PADDR_MASK&PAGE_MASK))))
+
+/* Construct an empty pte. */
+#define l1e_empty_32() ((l1_pgentry_32_t) { 0 })
+#define l2e_empty_32() ((l2_pgentry_32_t) { 0 })
+
+/* Construct a pte from a pfn and access flags. */
+#define l1e_from_pfn_32(pfn, flags) \
+ ((l1_pgentry_32_t) { ((intpte_32_t)(pfn) << PAGE_SHIFT) | put_pte_flags_32(flags) })
+#define l2e_from_pfn_32(pfn, flags) \
+ ((l2_pgentry_32_t) { ((intpte_32_t)(pfn) << PAGE_SHIFT) | put_pte_flags_32(flags) })
+
+/* Construct a pte from a physical address and access flags. */
+#ifndef __ASSEMBLY__
+/* Build an L1 entry from a physical address and access flags.  The
+ * address must carry no bits outside PADDR_MASK & PAGE_MASK (i.e. it
+ * must be page-aligned and within the supported physical range). */
+static inline l1_pgentry_32_t l1e_from_paddr_32(paddr_t pa, unsigned int flags)
+{
+    ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0);
+    return (l1_pgentry_32_t) { pa | put_pte_flags_32(flags) };
+}
+/* As above, for an L2 entry. */
+static inline l2_pgentry_32_t l2e_from_paddr_32(paddr_t pa, unsigned int flags)
+{
+    ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0);
+    return (l2_pgentry_32_t) { pa | put_pte_flags_32(flags) };
+}
+#endif /* !__ASSEMBLY__ */
+
+
+/* Construct a pte from a page pointer and access flags. */
+#define l1e_from_page_32(page, flags) (l1e_from_pfn_32(page_to_mfn(page),(flags)))
+#define l2e_from_page_32(page, flags) (l2e_from_pfn_32(page_to_mfn(page),(flags)))
+
+/* Add extra flags to an existing pte. */
+#define l1e_add_flags_32(x, flags) ((x).l1 |= put_pte_flags_32(flags))
+#define l2e_add_flags_32(x, flags) ((x).l2 |= put_pte_flags_32(flags))
+
+/* Remove flags from an existing pte. */
+#define l1e_remove_flags_32(x, flags) ((x).l1 &= ~put_pte_flags_32(flags))
+#define l2e_remove_flags_32(x, flags) ((x).l2 &= ~put_pte_flags_32(flags))
+
+/* Check if a pte's page mapping or significant access flags have changed. */
+#define l1e_has_changed_32(x,y,flags) \
+ ( !!(((x).l1 ^ (y).l1) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags_32(flags))) )
+#define l2e_has_changed_32(x,y,flags) \
+ ( !!(((x).l2 ^ (y).l2) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags_32(flags))) )
+
+/* Given a virtual address, get an entry offset into a page table. */
+#define l1_table_offset_32(a) \
+ (((a) >> L1_PAGETABLE_SHIFT_32) & (L1_PAGETABLE_ENTRIES_32 - 1))
+#define l2_table_offset_32(a) \
+ (((a) >> L2_PAGETABLE_SHIFT_32) & (L2_PAGETABLE_ENTRIES_32 - 1))
+
+#define linear_l1_table_32 \
+ ((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START))
+
+#define linear_pg_table_32 linear_l1_table_32
+
+#endif /* __X86_PAGE_GUEST_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * arch/x86/mm/shadow/private.h
+ *
+ * Shadow code that is private, and does not need to be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _XEN_SHADOW_PRIVATE_H
+#define _XEN_SHADOW_PRIVATE_H
+
+// In order to override the definition of mfn_to_page, we make sure page.h has
+// been included...
+#include <asm/page.h>
+#include <xen/domain_page.h>
+#include <asm/x86_emulate.h>
+#include <asm/hvm/support.h>
+
+
+/******************************************************************************
+ * Definitions for the use of the "available" bits in the shadow PTEs.
+ *
+ * Review of the low 12 bits of a shadow page table entry:
+ *
+ * in a guest: in a shadow:
+ * Bit 11: _PAGE_AVAIL2, aka _PAGE_GNTTAB
+ * Bit 10: _PAGE_AVAIL1 _PAGE_SHADOW_RW ("SW" below)
+ * Bit 9: _PAGE_AVAIL0 _PAGE_SHADOW_PRESENT ("SP" below)
+ * Bit 8: _PAGE_GLOBAL _PAGE_SHADOW_MMIO ("MMIO" below),
+ * aka _PAGE_SHADOW_GUEST_NOT_PRESENT
+ * Bit 7: _PAGE_PSE, aka _PAGE_PAT
+ * Bit 6: _PAGE_DIRTY
+ * Bit 5: _PAGE_ACCESSED
+ * Bit 4: _PAGE_PCD
+ * Bit 3: _PAGE_PWT
+ * Bit 2: _PAGE_USER
+ * Bit 1: _PAGE_RW ("GW" below)
+ * Bit 0: _PAGE_PRESENT ("GP" below)
+ *
+ * Given a guest entry, as shown below, we can expect the following in the
+ * corresponding shadow entry:
+ *
+ * Guest entry Shadow entry Commentary
+ * ----------- ---------------- ---------------------------------------------
+ * Maps
+ * GP GW IO GP SP GW SW MMIO
+ * -- -- ---- -- -- -- -- ----
+ * - - - 0 0 0 0 0 The guest entry has not yet been shadowed.
+ * 0 - - 0 0 0 0 1 The guest entry is marked not-present.
+ * 1 1 no ? 1 ? 1 0 Writable entry in the guest.
+ * 1 0 no ? 1 0 0 0 Read-only entry in the guest.
+ * 1 1 yes 0 1 ? 1 1 Writable MMIO mapping in the guest.
+ * 1 0 yes 0 1 0 0 1 Read-only MMIO mapping in the guest.
+ *
+ * Normally, we would expect that GP=1 in the guest to imply GP=1 in the
+ * shadow, and similarly for GW=1. However, various functionality that may be
+ * implemented via the shadow can cause GP or GW to be cleared in such cases.
+ * A & D bit emulation is a prime example of such functionality.
+ *
+ * If _PAGE_SHADOW_PRESENT is zero, then the _PAGE_PRESENT bit in that same
+ * entry will always be zero, too.
+ *
+ * Bit 11 is used in debug builds as the _PAGE_GNTTAB bit in PV guests. It is
+ * currently available for random (ab)use in shadow entries.
+ *
+ * Bit 8 (the global bit) could be propagated from an HVM guest to the shadow,
+ * but currently there is no benefit, as the guest's TLB is flushed on every
+ * transition of CR3 anyway due to the HVM exit/re-entry.
+ *
+ * In shadow entries in which the _PAGE_SHADOW_PRESENT is set, bit 8 is used
+ * as the _PAGE_SHADOW_MMIO bit. In such entries, if _PAGE_SHADOW_MMIO is
+ * set, then the entry contains the *gfn* directly from the corresponding
+ * guest entry (not an mfn!!).
+ *
+ * Bit 7 is set in a guest L2 to signify a superpage entry. The current
+ * shadow code splinters superpage mappings into 512 or 1024 4K mappings; the
+ * resulting shadow L1 table is called an FL1. Note that there is no guest
+ * page that corresponds to an FL1.
+ *
+ * Bit 7 in a guest L1 is the PAT2 bit. Currently we do not support PAT in
+ * this shadow code.
+ *
+ * Bit 6 is the dirty bit.
+ *
+ * Bit 5 is the accessed bit.
+ *
+ * Bit 4 is the cache disable bit. If set in a guest, the hardware is
+ * supposed to refuse to cache anything found via this entry. It can be set
+ * in an L4e, L3e, L2e, or L1e. This shadow code currently does not support
+ * cache disable bits. They are silently ignored.
+ *
+ * Bit 4 in a guest L1 is also the PAT1 bit.  Currently we do not support PAT
+ * in this shadow code.
+ *
+ * Bit 3 is the cache write-thru bit. If set in a guest, the hardware is
+ * supposed to use write-thru instead of write-back caching for anything found
+ * via this entry. It can be set in an L4e, L3e, L2e, or L1e. This shadow
+ * code currently does not support cache write-thru bits. They are silently
+ * ignored.
+ *
+ * Bit 3 in a guest L1 is also the PAT0 bit.  Currently we do not support PAT
+ * in this shadow code.
+ *
+ * Bit 2 is the user bit.
+ *
+ * Bit 1 is the read-write bit.
+ *
+ * Bit 0 is the present bit.
+ */
+
+// Copy of the _PAGE_RW bit from the guest's PTE, appropriately zero'ed by
+// the appropriate shadow rules.
+#define _PAGE_SHADOW_RW _PAGE_AVAIL1
+
+// Copy of the _PAGE_PRESENT bit from the guest's PTE
+#define _PAGE_SHADOW_PRESENT _PAGE_AVAIL0
+
+// The matching guest entry maps MMIO space
+#define _PAGE_SHADOW_MMIO _PAGE_GLOBAL
+
+// Shadow flags value used when the guest is not present
+#define _PAGE_SHADOW_GUEST_NOT_PRESENT _PAGE_GLOBAL
+
+
+/******************************************************************************
+ * Debug and error-message output
+ */
+#define SHADOW_PRINTK(_f, _a...) \
+ debugtrace_printk("sh: %s(): " _f, __func__, ##_a)
+#define SHADOW_ERROR(_f, _a...) \
+ printk("sh error: %s(): " _f, __func__, ##_a)
+#define SHADOW_DEBUG(flag, _f, _a...) \
+ do { \
+ if (SHADOW_DEBUG_ ## flag) \
+ debugtrace_printk("shdebug: %s(): " _f, __func__, ##_a); \
+ } while (0)
+
+// The flags for use with SHADOW_DEBUG:
+#define SHADOW_DEBUG_PROPAGATE 0
+#define SHADOW_DEBUG_MAKE_SHADOW 0
+#define SHADOW_DEBUG_DESTROY_SHADOW 0
+#define SHADOW_DEBUG_P2M 0
+#define SHADOW_DEBUG_A_AND_D 0
+#define SHADOW_DEBUG_EMULATE 0
+#define SHADOW_DEBUG_LOGDIRTY 1
+
+
+/******************************************************************************
+ * Auditing routines
+ */
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
+extern void shadow_audit_tables(struct vcpu *v);
+#else
+#define shadow_audit_tables(_v) do {} while(0)
+#endif
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_P2M
+extern void shadow_audit_p2m(struct domain *d);
+#else
+#define shadow_audit_p2m(_d) do {} while(0)
+#endif
+
+
+/******************************************************************************
+ * Mechanism for double-checking the optimized pagefault path: this
+ * structure contains a record of actions taken by the fault handling
+ * code. In paranoid mode, the fast-path code fills out one of these
+ * structures (but doesn't take any actual action) and then the normal
+ * path fills in another. When the fault handler finishes, the
+ * two are compared */
+
+#ifdef SHADOW_OPTIMIZATION_PARANOIA
+
+typedef struct shadow_action_log sh_log_t;
+struct shadow_action_log {
+ paddr_t ad[CONFIG_PAGING_LEVELS]; /* A & D bits propagated here */
+ paddr_t mmio; /* Address of an mmio operation */
+ int rv; /* Result of the fault handler */
+};
+
+/* There are two logs, one for the fast path, one for the normal path */
+enum sh_log_type { log_slow = 0, log_fast= 1 };
+
+/* Alloc and zero the logs */
+static inline void sh_init_log(struct vcpu *v)
+{
+    /* Lazily allocate the pair of logs (slow-path and fast-path), then
+     * zero both before a new fault is traced.
+     * NOTE(review): an xmalloc_array failure is only caught by the
+     * ASSERT; in non-debug builds the memset would dereference NULL --
+     * confirm this paranoia mode is debug-only. */
+    if ( unlikely(!v->arch.shadow.action_log) )
+        v->arch.shadow.action_log = xmalloc_array(sh_log_t, 2);
+    ASSERT(v->arch.shadow.action_log);
+    memset(v->arch.shadow.action_log, 0, 2 * sizeof (sh_log_t));
+}
+
+/* Log an A&D-bit update */
+static inline void sh_log_ad(struct vcpu *v, paddr_t e, unsigned int level)
+{
+    /* Record the entry written for A/D propagation at this level. */
+    v->arch.shadow.action_log[v->arch.shadow.action_index].ad[level] = e;
+}
+
+/* Log an MMIO address */
+static inline void sh_log_mmio(struct vcpu *v, paddr_t m)
+{
+    v->arch.shadow.action_log[v->arch.shadow.action_index].mmio = m;
+}
+
+/* Log the result */
+static inline void sh_log_rv(struct vcpu *v, int rv)
+{
+    v->arch.shadow.action_log[v->arch.shadow.action_index].rv = rv;
+}
+
+/* Set which mode we're in */
+static inline void sh_set_log_mode(struct vcpu *v, enum sh_log_type t)
+{
+    /* Selects which of the two logs subsequent sh_log_* calls fill. */
+    v->arch.shadow.action_index = t;
+}
+
+/* Know not to take action, because we're only checking the mechanism */
+static inline int sh_take_no_action(struct vcpu *v)
+{
+    return (v->arch.shadow.action_index == log_fast);
+}
+
+#else /* Non-paranoid mode: these logs do not exist */
+
+#define sh_init_log(_v) do { (void)(_v); } while(0)
+#define sh_set_log_mode(_v,_t) do { (void)(_v); } while(0)
+#define sh_log_ad(_v,_e,_l) do { (void)(_v),(void)(_e),(void)(_l); } while (0)
+#define sh_log_mmio(_v,_m) do { (void)(_v),(void)(_m); } while (0)
+#define sh_log_rv(_v,_r) do { (void)(_v),(void)(_r); } while (0)
+#define sh_take_no_action(_v) (((void)(_v)), 0)
+
+#endif /* SHADOW_OPTIMIZATION_PARANOIA */
+
+
+/******************************************************************************
+ * Macro for dealing with the naming of the internal names of the
+ * shadow code's external entry points.
+ */
+#define SHADOW_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) \
+ name ## __shadow_ ## shadow_levels ## _guest_ ## guest_levels
+#define SHADOW_INTERNAL_NAME(name, shadow_levels, guest_levels) \
+ SHADOW_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels)
+
+#if CONFIG_PAGING_LEVELS == 2
+#define GUEST_LEVELS 2
+#define SHADOW_LEVELS 2
+#include "multi.h"
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+#endif /* CONFIG_PAGING_LEVELS == 2 */
+
+#if CONFIG_PAGING_LEVELS == 3
+#define GUEST_LEVELS 2
+#define SHADOW_LEVELS 3
+#include "multi.h"
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS 3
+#define SHADOW_LEVELS 3
+#include "multi.h"
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+#endif /* CONFIG_PAGING_LEVELS == 3 */
+
+#if CONFIG_PAGING_LEVELS == 4
+#define GUEST_LEVELS 2
+#define SHADOW_LEVELS 3
+#include "multi.h"
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS 3
+#define SHADOW_LEVELS 3
+#include "multi.h"
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS 3
+#define SHADOW_LEVELS 4
+#include "multi.h"
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS 4
+#define SHADOW_LEVELS 4
+#include "multi.h"
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+#endif /* CONFIG_PAGING_LEVELS == 4 */
+
+
+/******************************************************************************
+ * Various function declarations
+ */
+
+/* x86 emulator support */
+extern struct x86_emulate_ops shadow_emulator_ops;
+
+/* Hash table functions */
+mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, u8 t);
+void shadow_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn);
+void shadow_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn);
+
+/* shadow promotion */
+void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type);
+void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type);
+
+/* Shadow page allocation functions */
+void shadow_prealloc(struct domain *d, unsigned int order);
+mfn_t shadow_alloc(struct domain *d,
+ u32 shadow_type,
+ unsigned long backpointer);
+void shadow_free(struct domain *d, mfn_t smfn);
+
+/* Function to convert a shadow to log-dirty */
+void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn);
+
+/* Dispatcher function: call the per-mode function that will unhook the
+ * non-Xen mappings in this top-level shadow mfn */
+void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn);
+
+/* Re-sync copies of PAE shadow L3 tables if they have been changed */
+void sh_pae_recopy(struct domain *d);
+
+/* Install the xen mappings in various flavours of shadow */
+void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn);
+void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn);
+void sh_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn);
+void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn);
+
+
+/******************************************************************************
+ * MFN/page-info handling
+ */
+
+// Override mfn_to_page from asm/page.h, which was #include'd above,
+// in order to make it work with our mfn type.
+#undef mfn_to_page
+#define mfn_to_page(_mfn) (frame_table + mfn_x(_mfn))
+
+// Override page_to_mfn from asm/page.h, which was #include'd above,
+// in order to make it work with our mfn type.
+#undef page_to_mfn
+#define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
+
+// Override mfn_valid from asm/page.h, which was #include'd above,
+// in order to make it work with our mfn type.
+#undef mfn_valid
+#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page)
+
+// Provide mfn_t-aware versions of common xen functions
+static inline void *
+sh_map_domain_page(mfn_t mfn)
+{
+ /* XXX Using the monitor-table as a map will happen here */
+ return map_domain_page(mfn_x(mfn));
+}
+
+static inline void
+sh_unmap_domain_page(void *p)
+{
+ /* XXX Using the monitor-table as a map will happen here */
+ unmap_domain_page(p);
+}
+
+static inline void *
+sh_map_domain_page_global(mfn_t mfn)
+{
+ /* XXX Using the monitor-table as a map will happen here */
+ return map_domain_page_global(mfn_x(mfn));
+}
+
+static inline void
+sh_unmap_domain_page_global(void *p)
+{
+ /* XXX Using the monitor-table as a map will happen here */
+ unmap_domain_page_global(p);
+}
+
+static inline int
+sh_mfn_is_dirty(struct domain *d, mfn_t gmfn)
+/* Is this guest page dirty?  Call only in log-dirty mode. */
+{
+    unsigned long pfn;
+    ASSERT(shadow_mode_log_dirty(d));
+    ASSERT(d->arch.shadow.dirty_bitmap != NULL);
+
+    /* We /really/ mean PFN here, even for non-translated guests. */
+    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+    /* Out-of-range or invalid M2P entries are treated as clean. */
+    if ( likely(VALID_M2P(pfn))
+         && likely(pfn < d->arch.shadow.dirty_bitmap_size)
+         && test_bit(pfn, d->arch.shadow.dirty_bitmap) )
+        return 1;
+
+    return 0;
+}
+
+/* Does this mfn currently hold a guest pagetable?  For refcounted
+ * shadow domains this is tracked by PGC_page_table; otherwise fall
+ * back to the frame's pagetable type in type_info. */
+static inline int
+sh_mfn_is_a_page_table(mfn_t gmfn)
+{
+    /* mfn_to_page here is pure pointer arithmetic (frame_table + mfn),
+     * so computing it before the validity check does not dereference
+     * anything.
+     * NOTE(review): this uses valid_mfn() while the file defines its
+     * own mfn_valid() macro above -- confirm both exist and agree. */
+    struct page_info *page = mfn_to_page(gmfn);
+    struct domain *owner;
+    unsigned long type_info;
+
+    if ( !valid_mfn(gmfn) )
+        return 0;
+
+    owner = page_get_owner(page);
+    if ( owner && shadow_mode_refcounts(owner)
+         && (page->count_info & PGC_page_table) )
+        return 1;
+
+    /* PGT_l1..PGT_l4 are contiguous type codes starting just above 0. */
+    type_info = page->u.inuse.type_info & PGT_type_mask;
+    return type_info && (type_info <= PGT_l4_page_table);
+}
+
+
+/**************************************************************************/
+/* Shadow-page refcounting. See comment in shadow-common.c about the
+ * use of struct page_info fields for shadow pages */
+
+void sh_destroy_shadow(struct vcpu *v, mfn_t smfn);
+
+/* Increase the refcount of a shadow page. Arguments are the mfn to refcount,
+ * and the physical address of the shadow entry that holds the ref (or zero
+ * if the ref is held by something else) */
+static inline void sh_get_ref(mfn_t smfn, paddr_t entry_pa)
+{
+    u32 x, nx;
+    struct page_info *page = mfn_to_page(smfn);
+
+    ASSERT(mfn_valid(smfn));
+
+    /* Refcount lives in the low bits of count_info (PGC_SH_count_mask). */
+    x = page->count_info & PGC_SH_count_mask;
+    nx = x + 1;
+
+    /* Overflowing the count field is unrecoverable: kill the domain. */
+    if ( unlikely(nx & ~PGC_SH_count_mask) )
+    {
+        SHADOW_PRINTK("shadow ref overflow, gmfn=%" PRtype_info " smfn=%lx\n",
+                       page->u.inuse.type_info, mfn_x(smfn));
+        domain_crash_synchronous();
+    }
+
+    /* Guarded by the shadow lock, so no need for atomic update */
+    page->count_info &= ~PGC_SH_count_mask;
+    page->count_info |= nx;
+
+    /* We remember the first shadow entry that points to each shadow. */
+    if ( entry_pa != 0 && page->up == 0 )
+        page->up = entry_pa;
+}
+
+
+/* Decrease the refcount of a shadow page. As for get_ref, takes the
+ * physical address of the shadow entry that held this reference. */
+static inline void sh_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
+{
+    u32 x, nx;
+    struct page_info *page = mfn_to_page(smfn);
+
+    ASSERT(mfn_valid(smfn));
+    /* Shadow pages are never owned by a domain. */
+    ASSERT(page_get_owner(page) == NULL);
+
+    /* If this is the entry in the up-pointer, remove it */
+    if ( entry_pa != 0 && page->up == entry_pa )
+        page->up = 0;
+
+    x = page->count_info & PGC_SH_count_mask;
+    nx = x - 1;
+
+    /* Dropping a ref we never held is unrecoverable: kill the domain. */
+    if ( unlikely(x == 0) )
+    {
+        SHADOW_PRINTK("shadow ref underflow, smfn=%lx oc=%08x t=%"
+                       PRtype_info "\n",
+                       mfn_x(smfn),
+                       page->count_info & PGC_SH_count_mask,
+                       page->u.inuse.type_info);
+        domain_crash_synchronous();
+    }
+
+    /* Guarded by the shadow lock, so no need for atomic update */
+    page->count_info &= ~PGC_SH_count_mask;
+    page->count_info |= nx;
+
+    /* Last reference gone: free the shadow page itself. */
+    if ( unlikely(nx == 0) )
+        sh_destroy_shadow(v, smfn);
+}
+
+
+/* Pin a shadow page: take an extra refcount and set the pin bit. */
+static inline void sh_pin(mfn_t smfn)
+{
+    struct page_info *page;
+
+    ASSERT(mfn_valid(smfn));
+    page = mfn_to_page(smfn);
+    /* Idempotent: an already-pinned page keeps its single pin ref. */
+    if ( !(page->count_info & PGC_SH_pinned) )
+    {
+        sh_get_ref(smfn, 0);
+        page->count_info |= PGC_SH_pinned;
+    }
+}
+
+/* Unpin a shadow page: unset the pin bit and release the extra ref. */
+static inline void sh_unpin(struct vcpu *v, mfn_t smfn)
+{
+    struct page_info *page;
+
+    ASSERT(mfn_valid(smfn));
+    page = mfn_to_page(smfn);
+    /* Idempotent: unpinning a non-pinned page is a no-op.  Dropping the
+     * pin ref may destroy the shadow if it was the last reference. */
+    if ( page->count_info & PGC_SH_pinned )
+    {
+        page->count_info &= ~PGC_SH_pinned;
+        sh_put_ref(v, smfn, 0);
+    }
+}
+
+/**************************************************************************/
+/* Guest physmap (p2m) support */
+
+/* Read our own P2M table, checking in the linear pagetables first to be
+ * sure that we will succeed. Call this function if you expect it to
+ * fail often, as it avoids page faults. If you expect to succeed, use
+ * vcpu_gfn_to_mfn, which copy_from_user()s the entry */
+static inline mfn_t
+vcpu_gfn_to_mfn_nofault(struct vcpu *v, unsigned long gfn)
+{
+    unsigned long entry_addr = (unsigned long) &phys_to_machine_mapping[gfn];
+#if CONFIG_PAGING_LEVELS >= 4
+    l4_pgentry_t *l4e;
+    l3_pgentry_t *l3e;
+#endif
+    l2_pgentry_t *l2e;
+    l1_pgentry_t *l1e;
+
+    /* The linear-table walk below is only valid on the running vcpu. */
+    ASSERT(current == v);
+    /* Non-translated guests: gfn and mfn are the same number. */
+    if ( !shadow_vcpu_mode_translate(v) )
+        return _mfn(gfn);
+
+#if CONFIG_PAGING_LEVELS > 2
+    if ( gfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
+        /* This pfn is higher than the p2m map can hold */
+        return _mfn(INVALID_MFN);
+#endif
+
+    /* Walk the linear pagetables.  Note that this is *not* the same as
+     * the walk in sh_gfn_to_mfn_foreign, which is walking the p2m map.
+     * Each level is checked for presence so that reading the p2m entry
+     * at the end cannot fault. */
+#if CONFIG_PAGING_LEVELS >= 4
+    l4e = __linear_l4_table + l4_linear_offset(entry_addr);
+    if ( !(l4e_get_flags(*l4e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+    l3e = __linear_l3_table + l3_linear_offset(entry_addr);
+    if ( !(l3e_get_flags(*l3e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+#endif
+    l2e = __linear_l2_table + l2_linear_offset(entry_addr);
+    if ( !(l2e_get_flags(*l2e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+    l1e = __linear_l1_table + l1_linear_offset(entry_addr);
+    if ( !(l1e_get_flags(*l1e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+
+    /* Safe to look at this part of the table */
+    if ( l1e_get_flags(phys_to_machine_mapping[gfn]) & _PAGE_PRESENT )
+        return _mfn(l1e_get_pfn(phys_to_machine_mapping[gfn]));
+
+    return _mfn(INVALID_MFN);
+}
+
+
+#endif /* _XEN_SHADOW_PRIVATE_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * arch/x86/mm/shadow/types.h
+ *
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _XEN_SHADOW_TYPES_H
+#define _XEN_SHADOW_TYPES_H
+
+// Map a shadow page
+/* Map a shadow page for temporary access; currently just a thin
+ * wrapper over sh_map_domain_page(). */
+static inline void *
+map_shadow_page(mfn_t smfn)
+{
+    // XXX -- Possible optimization/measurement question for 32-bit and PAE
+    //        hypervisors:
+    //        How often is this smfn already available in the shadow linear
+    //        table?  Might it be worth checking that table first,
+    //        presumably using the reverse map hint in the page_info of this
+    //        smfn, rather than calling map_domain_page()?
+    //
+    return sh_map_domain_page(smfn);
+}
+
+// matching unmap for map_shadow_page()
+static inline void
+unmap_shadow_page(void *p)
+{
+    sh_unmap_domain_page(p);
+}
+
+/*
+ * Define various types for handling pagetables, based on these options:
+ * SHADOW_PAGING_LEVELS : Number of levels of shadow pagetables
+ * GUEST_PAGING_LEVELS : Number of levels of guest pagetables
+ */
+
+#if (CONFIG_PAGING_LEVELS < SHADOW_PAGING_LEVELS)
+#error Cannot have more levels of shadow pagetables than host pagetables
+#endif
+
+#if (SHADOW_PAGING_LEVELS < GUEST_PAGING_LEVELS)
+#error Cannot have more levels of guest pagetables than shadow pagetables
+#endif
+
+#if SHADOW_PAGING_LEVELS == 2
+#define SHADOW_L1_PAGETABLE_ENTRIES 1024
+#define SHADOW_L2_PAGETABLE_ENTRIES 1024
+#define SHADOW_L1_PAGETABLE_SHIFT 12
+#define SHADOW_L2_PAGETABLE_SHIFT 22
+#endif
+
+#if SHADOW_PAGING_LEVELS == 3
+#define SHADOW_L1_PAGETABLE_ENTRIES 512
+#define SHADOW_L2_PAGETABLE_ENTRIES 512
+#define SHADOW_L3_PAGETABLE_ENTRIES 4
+#define SHADOW_L1_PAGETABLE_SHIFT 12
+#define SHADOW_L2_PAGETABLE_SHIFT 21
+#define SHADOW_L3_PAGETABLE_SHIFT 30
+#endif
+
+#if SHADOW_PAGING_LEVELS == 4
+#define SHADOW_L1_PAGETABLE_ENTRIES 512
+#define SHADOW_L2_PAGETABLE_ENTRIES 512
+#define SHADOW_L3_PAGETABLE_ENTRIES 512
+#define SHADOW_L4_PAGETABLE_ENTRIES 512
+#define SHADOW_L1_PAGETABLE_SHIFT 12
+#define SHADOW_L2_PAGETABLE_SHIFT 21
+#define SHADOW_L3_PAGETABLE_SHIFT 30
+#define SHADOW_L4_PAGETABLE_SHIFT 39
+#endif
+
+/* Types of the shadow page tables */
+typedef l1_pgentry_t shadow_l1e_t;
+typedef l2_pgentry_t shadow_l2e_t;
+#if SHADOW_PAGING_LEVELS >= 3
+typedef l3_pgentry_t shadow_l3e_t;
+#if SHADOW_PAGING_LEVELS >= 4
+typedef l4_pgentry_t shadow_l4e_t;
+#endif
+#endif
+
+/* Access functions for them */
+static inline paddr_t shadow_l1e_get_paddr(shadow_l1e_t sl1e)
+{ return l1e_get_paddr(sl1e); }
+static inline paddr_t shadow_l2e_get_paddr(shadow_l2e_t sl2e)
+{ return l2e_get_paddr(sl2e); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline paddr_t shadow_l3e_get_paddr(shadow_l3e_t sl3e)
+{ return l3e_get_paddr(sl3e); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline paddr_t shadow_l4e_get_paddr(shadow_l4e_t sl4e)
+{ return l4e_get_paddr(sl4e); }
+#endif
+#endif
+
+static inline mfn_t shadow_l1e_get_mfn(shadow_l1e_t sl1e)
+{ return _mfn(l1e_get_pfn(sl1e)); }
+static inline mfn_t shadow_l2e_get_mfn(shadow_l2e_t sl2e)
+{ return _mfn(l2e_get_pfn(sl2e)); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline mfn_t shadow_l3e_get_mfn(shadow_l3e_t sl3e)
+{ return _mfn(l3e_get_pfn(sl3e)); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline mfn_t shadow_l4e_get_mfn(shadow_l4e_t sl4e)
+{ return _mfn(l4e_get_pfn(sl4e)); }
+#endif
+#endif
+
+static inline u32 shadow_l1e_get_flags(shadow_l1e_t sl1e)
+{ return l1e_get_flags(sl1e); }
+static inline u32 shadow_l2e_get_flags(shadow_l2e_t sl2e)
+{ return l2e_get_flags(sl2e); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline u32 shadow_l3e_get_flags(shadow_l3e_t sl3e)
+{ return l3e_get_flags(sl3e); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline u32 shadow_l4e_get_flags(shadow_l4e_t sl4e)
+{ return l4e_get_flags(sl4e); }
+#endif
+#endif
+
+static inline shadow_l1e_t
+shadow_l1e_remove_flags(shadow_l1e_t sl1e, u32 flags)
+{ l1e_remove_flags(sl1e, flags); return sl1e; }
+
+static inline shadow_l1e_t shadow_l1e_empty(void)
+{ return l1e_empty(); }
+static inline shadow_l2e_t shadow_l2e_empty(void)
+{ return l2e_empty(); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline shadow_l3e_t shadow_l3e_empty(void)
+{ return l3e_empty(); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline shadow_l4e_t shadow_l4e_empty(void)
+{ return l4e_empty(); }
+#endif
+#endif
+
+static inline shadow_l1e_t shadow_l1e_from_mfn(mfn_t mfn, u32 flags)
+{ return l1e_from_pfn(mfn_x(mfn), flags); }
+static inline shadow_l2e_t shadow_l2e_from_mfn(mfn_t mfn, u32 flags)
+{ return l2e_from_pfn(mfn_x(mfn), flags); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline shadow_l3e_t shadow_l3e_from_mfn(mfn_t mfn, u32 flags)
+{ return l3e_from_pfn(mfn_x(mfn), flags); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags)
+{ return l4e_from_pfn(mfn_x(mfn), flags); }
+#endif
+#endif
+
+#define shadow_l1_table_offset(a) l1_table_offset(a)
+#define shadow_l2_table_offset(a) l2_table_offset(a)
+#define shadow_l3_table_offset(a) l3_table_offset(a)
+#define shadow_l4_table_offset(a) l4_table_offset(a)
+
+/**************************************************************************/
+/* Access to the linear mapping of shadow page tables. */
+
+/* Offsets into each level of the linear mapping for a virtual address. */
+#define shadow_l1_linear_offset(_a) \
+ (((_a) & VADDR_MASK) >> SHADOW_L1_PAGETABLE_SHIFT)
+#define shadow_l2_linear_offset(_a) \
+ (((_a) & VADDR_MASK) >> SHADOW_L2_PAGETABLE_SHIFT)
+#define shadow_l3_linear_offset(_a) \
+ (((_a) & VADDR_MASK) >> SHADOW_L3_PAGETABLE_SHIFT)
+#define shadow_l4_linear_offset(_a) \
+ (((_a) & VADDR_MASK) >> SHADOW_L4_PAGETABLE_SHIFT)
+
+/* Where to find each level of the linear mapping. For PV guests, we use
+ * the shadow linear-map self-entry as many times as we need. For HVM
+ * guests, the shadow doesn't have a linear-map self-entry so we must use
+ * the monitor-table's linear-map entry N-1 times and then the shadow-map
+ * entry once. */
+#define __sh_linear_l1_table ((shadow_l1e_t *)(SH_LINEAR_PT_VIRT_START))
+#define __sh_linear_l2_table ((shadow_l2e_t *) \
+ (__sh_linear_l1_table + shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)))
+
+// shadow linear L3 and L4 tables only exist in 4 level paging...
+#if SHADOW_PAGING_LEVELS == 4
+#define __sh_linear_l3_table ((shadow_l3e_t *) \
+ (__sh_linear_l2_table + shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)))
+#define __sh_linear_l4_table ((shadow_l4e_t *) \
+ (__sh_linear_l3_table + shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)))
+#endif
+
+#define sh_linear_l1_table(v) ({ \
+ ASSERT(current == (v)); \
+ __sh_linear_l1_table; \
+})
+
+#define sh_linear_l2_table(v) ({ \
+ ASSERT(current == (v)); \
+ ((shadow_l2e_t *) \
+ (hvm_guest(v) ? __linear_l1_table : __sh_linear_l1_table) + \
+ shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)); \
+})
+
+// shadow linear L3 and L4 tables only exist in 4 level paging...
+#if SHADOW_PAGING_LEVELS == 4
+#define sh_linear_l3_table(v) ({ \
+ ASSERT(current == (v)); \
+ ((shadow_l3e_t *) \
+ (hvm_guest(v) ? __linear_l2_table : __sh_linear_l2_table) + \
+ shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)); \
+})
+
+// we use l4_pgentry_t instead of shadow_l4e_t below because shadow_l4e_t is
+// not defined for when xen_levels==4 & shadow_levels==3...
+#define sh_linear_l4_table(v) ({ \
+ ASSERT(current == (v)); \
+ ((l4_pgentry_t *) \
+ (hvm_guest(v) ? __linear_l3_table : __sh_linear_l3_table) + \
+ shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)); \
+})
+#endif
+
+#if GUEST_PAGING_LEVELS == 2
+
+#include "page-guest32.h"
+
+#define GUEST_L1_PAGETABLE_ENTRIES 1024
+#define GUEST_L2_PAGETABLE_ENTRIES 1024
+#define GUEST_L1_PAGETABLE_SHIFT 12
+#define GUEST_L2_PAGETABLE_SHIFT 22
+
+/* Type of the guest's frame numbers */
+TYPE_SAFE(u32,gfn)
+#define INVALID_GFN ((u32)(-1u))
+#define SH_PRI_gfn "05x"
+
+/* Types of the guest's page tables */
+typedef l1_pgentry_32_t guest_l1e_t;
+typedef l2_pgentry_32_t guest_l2e_t;
+
+/* Access functions for them */
+static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
+{ return l1e_get_paddr_32(gl1e); }
+static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
+{ return l2e_get_paddr_32(gl2e); }
+
+static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
+{ return _gfn(l1e_get_paddr_32(gl1e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
+{ return _gfn(l2e_get_paddr_32(gl2e) >> PAGE_SHIFT); }
+
+static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
+{ return l1e_get_flags_32(gl1e); }
+static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
+{ return l2e_get_flags_32(gl2e); }
+
+static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
+{ l1e_add_flags_32(gl1e, flags); return gl1e; }
+static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
+{ l2e_add_flags_32(gl2e, flags); return gl2e; }
+
+static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
+{ return l1e_from_pfn_32(gfn_x(gfn), flags); }
+static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
+{ return l2e_from_pfn_32(gfn_x(gfn), flags); }
+
+#define guest_l1_table_offset(a) l1_table_offset_32(a)
+#define guest_l2_table_offset(a) l2_table_offset_32(a)
+
+/* The shadow types needed for the various levels. */
+#define PGC_SH_l1_shadow PGC_SH_l1_32_shadow
+#define PGC_SH_l2_shadow PGC_SH_l2_32_shadow
+#define PGC_SH_fl1_shadow PGC_SH_fl1_32_shadow
+
+#else /* GUEST_PAGING_LEVELS != 2 */
+
+#if GUEST_PAGING_LEVELS == 3
+#define GUEST_L1_PAGETABLE_ENTRIES 512
+#define GUEST_L2_PAGETABLE_ENTRIES 512
+#define GUEST_L3_PAGETABLE_ENTRIES 4
+#define GUEST_L1_PAGETABLE_SHIFT 12
+#define GUEST_L2_PAGETABLE_SHIFT 21
+#define GUEST_L3_PAGETABLE_SHIFT 30
+#else /* GUEST_PAGING_LEVELS == 4 */
+#define GUEST_L1_PAGETABLE_ENTRIES 512
+#define GUEST_L2_PAGETABLE_ENTRIES 512
+#define GUEST_L3_PAGETABLE_ENTRIES 512
+#define GUEST_L4_PAGETABLE_ENTRIES 512
+#define GUEST_L1_PAGETABLE_SHIFT 12
+#define GUEST_L2_PAGETABLE_SHIFT 21
+#define GUEST_L3_PAGETABLE_SHIFT 30
+#define GUEST_L4_PAGETABLE_SHIFT 39
+#endif
+
+/* Type of the guest's frame numbers */
+TYPE_SAFE(unsigned long,gfn)
+#define INVALID_GFN ((unsigned long)(-1ul))
+#define SH_PRI_gfn "05lx"
+
+/* Types of the guest's page tables */
+typedef l1_pgentry_t guest_l1e_t;
+typedef l2_pgentry_t guest_l2e_t;
+typedef l3_pgentry_t guest_l3e_t;
+#if GUEST_PAGING_LEVELS >= 4
+typedef l4_pgentry_t guest_l4e_t;
+#endif
+
+/* Access functions for them */
+static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
+{ return l1e_get_paddr(gl1e); }
+static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
+{ return l2e_get_paddr(gl2e); }
+static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e)
+{ return l3e_get_paddr(gl3e); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e)
+{ return l4e_get_paddr(gl4e); }
+#endif
+
+static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
+{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
+{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e)
+{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e)
+{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); }
+#endif
+
+static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
+{ return l1e_get_flags(gl1e); }
+static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
+{ return l2e_get_flags(gl2e); }
+static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e)
+{ return l3e_get_flags(gl3e); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e)
+{ return l4e_get_flags(gl4e); }
+#endif
+
+static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
+{ l1e_add_flags(gl1e, flags); return gl1e; }
+static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
+{ l2e_add_flags(gl2e, flags); return gl2e; }
+static inline guest_l3e_t guest_l3e_add_flags(guest_l3e_t gl3e, u32 flags)
+{ l3e_add_flags(gl3e, flags); return gl3e; }
+#if GUEST_PAGING_LEVELS >= 4
+static inline guest_l4e_t guest_l4e_add_flags(guest_l4e_t gl4e, u32 flags)
+{ l4e_add_flags(gl4e, flags); return gl4e; }
+#endif
+
+static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
+{ return l1e_from_pfn(gfn_x(gfn), flags); }
+static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
+{ return l2e_from_pfn(gfn_x(gfn), flags); }
+static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags)
+{ return l3e_from_pfn(gfn_x(gfn), flags); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
+{ return l4e_from_pfn(gfn_x(gfn), flags); }
+#endif
+
+#define guest_l1_table_offset(a) l1_table_offset(a)
+#define guest_l2_table_offset(a) l2_table_offset(a)
+#define guest_l3_table_offset(a) l3_table_offset(a)
+#define guest_l4_table_offset(a) l4_table_offset(a)
+
+/* The shadow types needed for the various levels. */
+#if GUEST_PAGING_LEVELS == 3
+#define PGC_SH_l1_shadow PGC_SH_l1_pae_shadow
+#define PGC_SH_fl1_shadow PGC_SH_fl1_pae_shadow
+#define PGC_SH_l2_shadow PGC_SH_l2_pae_shadow
+#define PGC_SH_l2h_shadow PGC_SH_l2h_pae_shadow
+#define PGC_SH_l3_shadow PGC_SH_l3_pae_shadow
+#else
+#define PGC_SH_l1_shadow PGC_SH_l1_64_shadow
+#define PGC_SH_fl1_shadow PGC_SH_fl1_64_shadow
+#define PGC_SH_l2_shadow PGC_SH_l2_64_shadow
+#define PGC_SH_l3_shadow PGC_SH_l3_64_shadow
+#define PGC_SH_l4_shadow PGC_SH_l4_64_shadow
+#endif
+
+#endif /* GUEST_PAGING_LEVELS != 2 */
+
+#define VALID_GFN(m) (m != INVALID_GFN)
+
+static inline int
+valid_gfn(gfn_t m)
+{
+ return VALID_GFN(gfn_x(m));
+}
+
+#if GUEST_PAGING_LEVELS == 2
+#define PGC_SH_guest_root_type PGC_SH_l2_32_shadow
+#elif GUEST_PAGING_LEVELS == 3
+#define PGC_SH_guest_root_type PGC_SH_l3_pae_shadow
+#else
+#define PGC_SH_guest_root_type PGC_SH_l4_64_shadow
+#endif
+
+/* Translation between mfns and gfns */
+static inline mfn_t
+vcpu_gfn_to_mfn(struct vcpu *v, gfn_t gfn)
+{
+ return sh_vcpu_gfn_to_mfn(v, gfn_x(gfn));
+}
+
+static inline gfn_t
+mfn_to_gfn(struct domain *d, mfn_t mfn)
+{
+ return _gfn(sh_mfn_to_gfn(d, mfn));
+}
+
+static inline paddr_t
+gfn_to_paddr(gfn_t gfn)
+{
+ return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT;
+}
+
+/* Type used for recording a walk through guest pagetables. It is
+ * filled in by the pagetable walk function, and also used as a cache
+ * for later walks.
+ * Any non-null pointer in this structure represents a mapping of guest
+ * memory. We must always call walk_init() before using a walk_t, and
+ * call walk_unmap() when we're done.
+ * The "Effective l1e" field is used when there isn't an l1e to point to,
+ * but we have fabricated an l1e for propagation to the shadow (e.g.,
+ * for splintering guest superpages into many shadow l1 entries). */
+typedef struct shadow_walk_t walk_t;
+struct shadow_walk_t
+{
+ unsigned long va; /* Address we were looking for */
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+ guest_l4e_t *l4e; /* Pointer to guest's level 4 entry */
+#endif
+ guest_l3e_t *l3e; /* Pointer to guest's level 3 entry */
+#endif
+ guest_l2e_t *l2e; /* Pointer to guest's level 2 entry */
+ guest_l1e_t *l1e; /* Pointer to guest's level 1 entry */
+ guest_l1e_t eff_l1e; /* Effective level 1 entry */
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+ mfn_t l4mfn; /* MFN that the level 4 entry is in */
+#endif
+ mfn_t l3mfn; /* MFN that the level 3 entry is in */
+#endif
+ mfn_t l2mfn; /* MFN that the level 2 entry is in */
+ mfn_t l1mfn; /* MFN that the level 1 entry is in */
+};
+
+/* macros for dealing with the naming of the internal function names of the
+ * shadow code's external entry points.
+ */
+#define INTERNAL_NAME(name) \
+ SHADOW_INTERNAL_NAME(name, SHADOW_PAGING_LEVELS, GUEST_PAGING_LEVELS)
+
+/* macros for renaming the primary entry points, so that they are more
+ * easily distinguished from a debugger
+ */
+#define sh_page_fault INTERNAL_NAME(sh_page_fault)
+#define sh_invlpg INTERNAL_NAME(sh_invlpg)
+#define sh_gva_to_gpa INTERNAL_NAME(sh_gva_to_gpa)
+#define sh_gva_to_gfn INTERNAL_NAME(sh_gva_to_gfn)
+#define sh_update_cr3 INTERNAL_NAME(sh_update_cr3)
+#define sh_remove_write_access INTERNAL_NAME(sh_remove_write_access)
+#define sh_remove_all_mappings INTERNAL_NAME(sh_remove_all_mappings)
+#define sh_remove_l1_shadow INTERNAL_NAME(sh_remove_l1_shadow)
+#define sh_remove_l2_shadow INTERNAL_NAME(sh_remove_l2_shadow)
+#define sh_remove_l3_shadow INTERNAL_NAME(sh_remove_l3_shadow)
+#define sh_map_and_validate_gl4e INTERNAL_NAME(sh_map_and_validate_gl4e)
+#define sh_map_and_validate_gl3e INTERNAL_NAME(sh_map_and_validate_gl3e)
+#define sh_map_and_validate_gl2e INTERNAL_NAME(sh_map_and_validate_gl2e)
+#define sh_map_and_validate_gl2he INTERNAL_NAME(sh_map_and_validate_gl2he)
+#define sh_map_and_validate_gl1e INTERNAL_NAME(sh_map_and_validate_gl1e)
+#define sh_destroy_l4_shadow INTERNAL_NAME(sh_destroy_l4_shadow)
+#define sh_destroy_l3_shadow INTERNAL_NAME(sh_destroy_l3_shadow)
+#define sh_destroy_l3_subshadow INTERNAL_NAME(sh_destroy_l3_subshadow)
+#define sh_unpin_all_l3_subshadows INTERNAL_NAME(sh_unpin_all_l3_subshadows)
+#define sh_destroy_l2_shadow INTERNAL_NAME(sh_destroy_l2_shadow)
+#define sh_destroy_l1_shadow INTERNAL_NAME(sh_destroy_l1_shadow)
+#define sh_unhook_32b_mappings INTERNAL_NAME(sh_unhook_32b_mappings)
+#define sh_unhook_pae_mappings INTERNAL_NAME(sh_unhook_pae_mappings)
+#define sh_unhook_64b_mappings INTERNAL_NAME(sh_unhook_64b_mappings)
+#define sh_paging_mode INTERNAL_NAME(sh_paging_mode)
+#define sh_detach_old_tables INTERNAL_NAME(sh_detach_old_tables)
+#define sh_x86_emulate_write INTERNAL_NAME(sh_x86_emulate_write)
+#define sh_x86_emulate_cmpxchg INTERNAL_NAME(sh_x86_emulate_cmpxchg)
+#define sh_x86_emulate_cmpxchg8b INTERNAL_NAME(sh_x86_emulate_cmpxchg8b)
+#define sh_audit_l1_table INTERNAL_NAME(sh_audit_l1_table)
+#define sh_audit_fl1_table INTERNAL_NAME(sh_audit_fl1_table)
+#define sh_audit_l2_table INTERNAL_NAME(sh_audit_l2_table)
+#define sh_audit_l3_table INTERNAL_NAME(sh_audit_l3_table)
+#define sh_audit_l4_table INTERNAL_NAME(sh_audit_l4_table)
+#define sh_guess_wrmap INTERNAL_NAME(sh_guess_wrmap)
+#define sh_clear_shadow_entry INTERNAL_NAME(sh_clear_shadow_entry)
+
+/* sh_make_monitor_table only depends on the number of shadow levels */
+#define sh_make_monitor_table \
+ SHADOW_INTERNAL_NAME(sh_make_monitor_table, \
+ SHADOW_PAGING_LEVELS, \
+ SHADOW_PAGING_LEVELS)
+#define sh_destroy_monitor_table \
+ SHADOW_INTERNAL_NAME(sh_destroy_monitor_table, \
+ SHADOW_PAGING_LEVELS, \
+ SHADOW_PAGING_LEVELS)
+
+
+#if GUEST_PAGING_LEVELS == 3
+/*
+ * Accounting information stored in the shadow of PAE Guest L3 pages.
+ * Because these "L3 pages" are only 32-bytes, it is inconvenient to keep
+ * various refcounts, etc., on the page_info of their page. We provide extra
+ * bookkeeping space in the shadow itself, and this is the structure
+ * definition for that bookkeeping information.
+ */
+struct pae_l3_bookkeeping {
+ u32 vcpus; /* bitmap of which vcpus are currently storing
+ * copies of this 32-byte page */
+ u32 refcount; /* refcount for this 32-byte page */
+ u8 pinned; /* is this 32-byte page pinned or not? */
+};
+
+// Convert a shadow entry pointer into a pae_l3_bookkeeping pointer.
+#define sl3p_to_info(_ptr) ((struct pae_l3_bookkeeping *) \
+ (((unsigned long)(_ptr) & ~31) + 32))
+
+static void sh_destroy_l3_subshadow(struct vcpu *v,
+ shadow_l3e_t *sl3e);
+
+/* Increment a subshadow ref
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow. */
+static inline void sh_get_ref_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn)
+{
+ struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
+
+ /* First ref to the subshadow takes a ref to the full shadow */
+ if ( bk->refcount == 0 )
+ sh_get_ref(smfn, 0);
+ if ( unlikely(++(bk->refcount) == 0) )
+ {
+ SHADOW_PRINTK("shadow l3 subshadow ref overflow, smfn=%" SH_PRI_mfn " sh=%p\n",
+ mfn_x(smfn), sl3e);
+ domain_crash_synchronous();
+ }
+}
+
+/* Decrement a subshadow ref.
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow. Calling this may cause the
+ * entire shadow to disappear, so the caller must immediately unmap
+ * the pointer after calling. */
+static inline void sh_put_ref_l3_subshadow(struct vcpu *v,
+ shadow_l3e_t *sl3e,
+ mfn_t smfn)
+{
+ struct pae_l3_bookkeeping *bk;
+
+ bk = sl3p_to_info(sl3e);
+
+ ASSERT(bk->refcount > 0);
+ if ( --(bk->refcount) == 0 )
+ {
+ /* Need to destroy this subshadow */
+ sh_destroy_l3_subshadow(v, sl3e);
+ /* Last ref to the subshadow had a ref to the full shadow */
+ sh_put_ref(v, smfn, 0);
+ }
+}
+
+/* Pin a subshadow
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow. */
+static inline void sh_pin_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn)
+{
+ struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
+
+#if 0
+ debugtrace_printk("%s smfn=%05lx offset=%ld\n",
+ __func__, mfn_x(smfn),
+ ((unsigned long)sl3e & ~PAGE_MASK) / 64);
+#endif
+
+ if ( !bk->pinned )
+ {
+ bk->pinned = 1;
+ sh_get_ref_l3_subshadow(sl3e, smfn);
+ }
+}
+
+/* Unpin a sub-shadow.
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow. Calling this may cause the
+ * entire shadow to disappear, so the caller must immediately unmap
+ * the pointer after calling. */
+static inline void sh_unpin_l3_subshadow(struct vcpu *v,
+ shadow_l3e_t *sl3e,
+ mfn_t smfn)
+{
+ struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
+
+#if 0
+ debugtrace_printk("%s smfn=%05lx offset=%ld\n",
+ __func__, mfn_x(smfn),
+ ((unsigned long)sl3e & ~PAGE_MASK) / 64);
+#endif
+
+ if ( bk->pinned )
+ {
+ bk->pinned = 0;
+ sh_put_ref_l3_subshadow(v, sl3e, smfn);
+ }
+}
+
+#endif /* GUEST_PAGING_LEVELS == 3 */
+
+#if SHADOW_PAGING_LEVELS == 3
+#define MFN_FITS_IN_HVM_CR3(_MFN) !(mfn_x(_MFN) >> 20)
+#endif
+
+#if SHADOW_PAGING_LEVELS == 2
+#define SH_PRI_pte "08x"
+#else /* SHADOW_PAGING_LEVELS >= 3 */
+#ifndef __x86_64__
+#define SH_PRI_pte "016llx"
+#else
+#define SH_PRI_pte "016lx"
+#endif
+#endif /* SHADOW_PAGING_LEVELS >= 3 */
+
+#if GUEST_PAGING_LEVELS == 2
+#define SH_PRI_gpte "08x"
+#else /* GUEST_PAGING_LEVELS >= 3 */
+#ifndef __x86_64__
+#define SH_PRI_gpte "016llx"
+#else
+#define SH_PRI_gpte "016lx"
+#endif
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+static inline u32
+accumulate_guest_flags(walk_t *gw)
+{
+ u32 accumulated_flags;
+
+ // We accumulate the permission flags with bitwise ANDing.
+ // This works for the PRESENT bit, RW bit, and USER bit.
+ // For the NX bit, however, the polarity is wrong, so we accumulate the
+ // inverse of the NX bit.
+ //
+ accumulated_flags = guest_l1e_get_flags(gw->eff_l1e) ^ _PAGE_NX_BIT;
+ accumulated_flags &= guest_l2e_get_flags(*gw->l2e) ^ _PAGE_NX_BIT;
+
+ // Note that PAE guests do not have USER or RW or NX bits in their L3s.
+ //
+#if GUEST_PAGING_LEVELS == 3
+ accumulated_flags &=
+ ~_PAGE_PRESENT | (guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT);
+#elif GUEST_PAGING_LEVELS >= 4
+ accumulated_flags &= guest_l3e_get_flags(*gw->l3e) ^ _PAGE_NX_BIT;
+ accumulated_flags &= guest_l4e_get_flags(*gw->l4e) ^ _PAGE_NX_BIT;
+#endif
+
+ // Finally, revert the NX bit back to its original polarity
+ accumulated_flags ^= _PAGE_NX_BIT;
+
+ return accumulated_flags;
+}
+
+#endif /* _XEN_SHADOW_TYPES_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+++ /dev/null
-/******************************************************************************
- * arch/x86/shadow2-common.c
- *
- * Shadow2 code that does not need to be multiply compiled.
- * Parts of this code are Copyright (c) 2006 by XenSource Inc.
- * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
- * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#define SHADOW2 1
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
-#include <xen/trace.h>
-#include <xen/sched.h>
-#include <xen/perfc.h>
-#include <xen/irq.h>
-#include <xen/domain_page.h>
-#include <xen/guest_access.h>
-#include <xen/keyhandler.h>
-#include <asm/event.h>
-#include <asm/page.h>
-#include <asm/current.h>
-#include <asm/flushtlb.h>
-#include <asm/shadow2.h>
-#include <asm/shadow2-private.h>
-
-#if SHADOW2_AUDIT
-int shadow2_audit_enable = 0;
-
-static void shadow2_audit_key(unsigned char key)
-{
- shadow2_audit_enable = !shadow2_audit_enable;
- printk("%s shadow2_audit_enable=%d\n",
- __func__, shadow2_audit_enable);
-}
-
-static int __init shadow2_audit_key_init(void)
-{
- register_keyhandler(
- 'O', shadow2_audit_key, "toggle shadow2 audits");
- return 0;
-}
-__initcall(shadow2_audit_key_init);
-#endif /* SHADOW2_AUDIT */
-
-static void sh2_free_log_dirty_bitmap(struct domain *d);
-
-int _shadow2_mode_refcounts(struct domain *d)
-{
- return shadow2_mode_refcounts(d);
-}
-
-
-/**************************************************************************/
-/* x86 emulator support for the shadow2 code
- */
-
-static int
-sh2_x86_emulate_read_std(unsigned long addr,
- unsigned long *val,
- unsigned int bytes,
- struct x86_emulate_ctxt *ctxt)
-{
- struct vcpu *v = current;
- if ( hvm_guest(v) )
- {
- *val = 0;
- // XXX -- this is WRONG.
- // It entirely ignores the permissions in the page tables.
- // In this case, that is only a user vs supervisor access check.
- //
- if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
- {
-#if 0
- SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
- v->domain->domain_id, v->vcpu_id,
- addr, *val, bytes);
-#endif
- return X86EMUL_CONTINUE;
- }
-
- /* If we got here, there was nothing mapped here, or a bad GFN
- * was mapped here. This should never happen: we're here because
- * of a write fault at the end of the instruction we're emulating. */
- SHADOW2_PRINTK("read failed to va %#lx\n", addr);
- return X86EMUL_PROPAGATE_FAULT;
- }
- else
- {
- SHADOW2_PRINTK("this operation is not emulated yet\n");
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
-static int
-sh2_x86_emulate_write_std(unsigned long addr,
- unsigned long val,
- unsigned int bytes,
- struct x86_emulate_ctxt *ctxt)
-{
- struct vcpu *v = current;
-#if 0
- SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
- v->domain->domain_id, v->vcpu_id, addr, val, bytes);
-#endif
- if ( hvm_guest(v) )
- {
- // XXX -- this is WRONG.
- // It entirely ignores the permissions in the page tables.
- // In this case, that includes user vs supervisor, and
- // write access.
- //
- if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
- return X86EMUL_CONTINUE;
-
- /* If we got here, there was nothing mapped here, or a bad GFN
- * was mapped here. This should never happen: we're here because
- * of a write fault at the end of the instruction we're emulating,
- * which should be handled by sh2_x86_emulate_write_emulated. */
- SHADOW2_PRINTK("write failed to va %#lx\n", addr);
- return X86EMUL_PROPAGATE_FAULT;
- }
- else
- {
- SHADOW2_PRINTK("this operation is not emulated yet\n");
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
-static int
-sh2_x86_emulate_write_emulated(unsigned long addr,
- unsigned long val,
- unsigned int bytes,
- struct x86_emulate_ctxt *ctxt)
-{
- struct vcpu *v = current;
-#if 0
- SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
- v->domain->domain_id, v->vcpu_id, addr, val, bytes);
-#endif
- if ( hvm_guest(v) )
- {
- return v->arch.shadow2.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
- }
- else
- {
- SHADOW2_PRINTK("this operation is not emulated yet\n");
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
-static int
-sh2_x86_emulate_cmpxchg_emulated(unsigned long addr,
- unsigned long old,
- unsigned long new,
- unsigned int bytes,
- struct x86_emulate_ctxt *ctxt)
-{
- struct vcpu *v = current;
-#if 0
- SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
- v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
-#endif
- if ( hvm_guest(v) )
- {
- return v->arch.shadow2.mode->x86_emulate_cmpxchg(v, addr, old, new,
- bytes, ctxt);
- }
- else
- {
- SHADOW2_PRINTK("this operation is not emulated yet\n");
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
-static int
-sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
- unsigned long old_lo,
- unsigned long old_hi,
- unsigned long new_lo,
- unsigned long new_hi,
- struct x86_emulate_ctxt *ctxt)
-{
- struct vcpu *v = current;
-#if 0
- SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
- v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
- new_hi, new_lo, ctxt);
-#endif
- if ( hvm_guest(v) )
- {
- return v->arch.shadow2.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
- new_lo, new_hi, ctxt);
- }
- else
- {
- SHADOW2_PRINTK("this operation is not emulated yet\n");
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
-
-struct x86_emulate_ops shadow2_emulator_ops = {
- .read_std = sh2_x86_emulate_read_std,
- .write_std = sh2_x86_emulate_write_std,
- .read_emulated = sh2_x86_emulate_read_std,
- .write_emulated = sh2_x86_emulate_write_emulated,
- .cmpxchg_emulated = sh2_x86_emulate_cmpxchg_emulated,
- .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated,
-};
-
-
-/**************************************************************************/
-/* Code for "promoting" a guest page to the point where the shadow code is
- * willing to let it be treated as a guest page table. This generally
- * involves making sure there are no writable mappings available to the guest
- * for this page.
- */
-void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type)
-{
- struct page_info *page = mfn_to_page(gmfn);
- unsigned long type_info;
-
- ASSERT(valid_mfn(gmfn));
-
- /* We should never try to promote a gmfn that has writeable mappings */
- ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0);
-
- // Is the page already shadowed?
- if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
- {
- // No prior shadow exists...
-
- // Grab a type-ref. We don't really care if we are racing with another
- // vcpu or not, or even what kind of type we get; we just want the type
- // count to be > 0.
- //
- do {
- type_info =
- page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
- } while ( !get_page_type(page, type_info) );
-
- // Now that the type ref is non-zero, we can safely use the
- // shadow2_flags.
- //
- page->shadow2_flags = 0;
- }
-
- ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
- set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
-}
-
-void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type)
-{
- struct page_info *page = mfn_to_page(gmfn);
-
- ASSERT(test_bit(_PGC_page_table, &page->count_info));
- ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
-
- clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
-
- if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 )
- {
- // release the extra type ref
- put_page_type(page);
-
- // clear the is-a-page-table bit.
- clear_bit(_PGC_page_table, &page->count_info);
- }
-}
-
-/**************************************************************************/
-/* Validate a pagetable change from the guest and update the shadows.
- * Returns a bitmask of SHADOW2_SET_* flags. */
-
-static int
-__shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
- void *entry, u32 size)
-{
- int result = 0;
- struct page_info *page = mfn_to_page(gmfn);
-
- sh2_mark_dirty(v->domain, gmfn);
-
- // Determine which types of shadows are affected, and update each.
- //
- // Always validate L1s before L2s to prevent another cpu with a linear
- // mapping of this gmfn from seeing a walk that results from
- // using the new L2 value and the old L1 value. (It is OK for such a
- // guest to see a walk that uses the old L2 value with the new L1 value,
- // as hardware could behave this way if one level of the pagewalk occurs
- // before the store, and the next level of the pagewalk occurs after the
- // store.
- //
- // Ditto for L2s before L3s, etc.
- //
-
- if ( !(page->count_info & PGC_page_table) )
- return 0; /* Not shadowed at all */
-
-#if CONFIG_PAGING_LEVELS == 2
- if ( page->shadow2_flags & SH2F_L1_32 )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2)
- (v, gmfn, entry, size);
-#else
- if ( page->shadow2_flags & SH2F_L1_32 )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2)
- (v, gmfn, entry, size);
-#endif
-
-#if CONFIG_PAGING_LEVELS == 2
- if ( page->shadow2_flags & SH2F_L2_32 )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2)
- (v, gmfn, entry, size);
-#else
- if ( page->shadow2_flags & SH2F_L2_32 )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2)
- (v, gmfn, entry, size);
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
- if ( page->shadow2_flags & SH2F_L1_PAE )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3)
- (v, gmfn, entry, size);
- if ( page->shadow2_flags & SH2F_L2_PAE )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3)
- (v, gmfn, entry, size);
- if ( page->shadow2_flags & SH2F_L2H_PAE )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3)
- (v, gmfn, entry, size);
- if ( page->shadow2_flags & SH2F_L3_PAE )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3)
- (v, gmfn, entry, size);
-#else /* 32-bit non-PAE hypervisor does not support PAE guests */
- ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0);
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
- if ( page->shadow2_flags & SH2F_L1_64 )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4)
- (v, gmfn, entry, size);
- if ( page->shadow2_flags & SH2F_L2_64 )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4)
- (v, gmfn, entry, size);
- if ( page->shadow2_flags & SH2F_L3_64 )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4)
- (v, gmfn, entry, size);
- if ( page->shadow2_flags & SH2F_L4_64 )
- result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 4, 4)
- (v, gmfn, entry, size);
-#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
- ASSERT((page->shadow2_flags
- & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0);
-#endif
-
- return result;
-}
-
-
-int
-shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
-/* This is the entry point from hypercalls. It returns a bitmask of all the
- * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
-{
- int rc;
-
- ASSERT(shadow2_lock_is_acquired(v->domain));
- rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
- shadow2_audit_tables(v);
- return rc;
-}
-
-void
-shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
- void *entry, u32 size)
-/* This is the entry point for emulated writes to pagetables in HVM guests */
-{
- struct domain *d = v->domain;
- int rc;
-
- ASSERT(shadow2_lock_is_acquired(v->domain));
- rc = __shadow2_validate_guest_entry(v, gmfn, entry, size);
- if ( rc & SHADOW2_SET_FLUSH )
- {
- // Flush everyone except the local processor, which will flush when it
- // re-enters the HVM guest.
- //
- cpumask_t mask = d->domain_dirty_cpumask;
- cpu_clear(v->processor, mask);
- flush_tlb_mask(mask);
- }
- if ( rc & SHADOW2_SET_ERROR )
- {
- /* This page is probably not a pagetable any more: tear it out of the
- * shadows, along with any tables that reference it */
- shadow2_remove_all_shadows_and_parents(v, gmfn);
- }
- /* We ignore the other bits: since we are about to change CR3 on
- * VMENTER we don't need to do any extra TLB flushes. */
-}
-
-
-/**************************************************************************/
-/* Memory management for shadow pages. */
-
-/* Meaning of the count_info field in shadow pages
- * ----------------------------------------------
- *
- * A count of all references to this page from other shadow pages and
- * guest CR3s (a.k.a. v->arch.shadow2.table).
- *
- * The top bits hold the shadow type and the pinned bit. Top-level
- * shadows are pinned so that they don't disappear when not in a CR3
- * somewhere.
- *
- * We don't need to use get|put_page for this as the updates are all
- * protected by the shadow lock. We can't use get|put_page for this
- * as the size of the count on shadow pages is different from that on
- * normal guest pages.
- */
-
-/* Meaning of the type_info field in shadow pages
- * ----------------------------------------------
- *
- * type_info use depends on the shadow type (from count_info)
- *
- * PGC_SH2_none : This page is in the shadow2 free pool. type_info holds
- * the chunk order for our freelist allocator.
- *
- * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info
- * holds the mfn of the guest page being shadowed,
- *
- * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage.
- * type_info holds the gfn being shattered.
- *
- * PGC_SH2_monitor_table : This page is part of a monitor table.
- * type_info is not used.
- */
-
-/* Meaning of the _domain field in shadow pages
- * --------------------------------------------
- *
- * In shadow pages, this field will always have its least significant bit
- * set. This ensures that all attempts to get_page() will fail (as all
- * valid pickled domain pointers have a zero for their least significant bit).
- * Instead, the remaining upper bits are used to record the shadow generation
- * counter when the shadow was created.
- */
-
-/* Meaning of the shadow2_flags field
- * ----------------------------------
- *
- * In guest pages that are shadowed, one bit for each kind of shadow they have.
- *
- * In shadow pages, will be used for holding a representation of the populated
- * entries in this shadow (either a min/max, or a bitmap, or ...)
- *
- * In monitor-table pages, holds the level of the particular page (to save
- * spilling the shadow types into an extra bit by having three types of monitor
- * page).
- */
-
-/* Meaning of the list_head struct in shadow pages
- * -----------------------------------------------
- *
- * In free shadow pages, this is used to hold the free-lists of chunks.
- *
- * In top-level shadow tables, this holds a linked-list of all top-level
- * shadows (used for recovering memory and destroying shadows).
- *
- * In lower-level shadows, this holds the physical address of a higher-level
- * shadow entry that holds a reference to this shadow (or zero).
- */
-
-/* Allocating shadow pages
- * -----------------------
- *
- * Most shadow pages are allocated singly, but there are two cases where we
- * need to allocate multiple pages together.
- *
- * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
- * A 32-bit guest l1 table covers 4MB of virtuial address space,
- * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
- * of virtual address space each). Similarly, a 32-bit guest l2 table
- * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va
- * each). These multi-page shadows are contiguous and aligned;
- * functions for handling offsets into them are defined in shadow2.c
- * (shadow_l1_index() etc.)
- *
- * 2: Shadowing PAE top-level pages. Each guest page that contains
- * any PAE top-level pages requires two shadow pages to shadow it.
- * They contain alternating l3 tables and pae_l3_bookkeeping structs.
- *
- * This table shows the allocation behaviour of the different modes:
- *
- * Xen paging 32b pae pae 64b 64b 64b
- * Guest paging 32b 32b pae 32b pae 64b
- * PV or HVM * HVM * HVM HVM *
- * Shadow paging 32b pae pae pae pae 64b
- *
- * sl1 size 4k 8k 4k 8k 4k 4k
- * sl2 size 4k 16k 4k 16k 4k 4k
- * sl3 size - - 8k - 8k 4k
- * sl4 size - - - - - 4k
- *
- * We allocate memory from xen in four-page units and break them down
- * with a simple buddy allocator. Can't use the xen allocator to handle
- * this as it only works for contiguous zones, and a domain's shadow
- * pool is made of fragments.
- *
- * In HVM guests, the p2m table is built out of shadow pages, and we provide
- * a function for the p2m management to steal pages, in max-order chunks, from
- * the free pool. We don't provide for giving them back, yet.
- */
-
-/* Figure out the least acceptable quantity of shadow memory.
- * The minimum memory requirement for always being able to free up a
- * chunk of memory is very small -- only three max-order chunks per
- * vcpu to hold the top level shadows and pages with Xen mappings in them.
- *
- * But for a guest to be guaranteed to successfully execute a single
- * instruction, we must be able to map a large number (about thirty) VAs
- * at the same time, which means that to guarantee progress, we must
- * allow for more than ninety allocated pages per vcpu. We round that
- * up to 128 pages, or half a megabyte per vcpu. */
-unsigned int shadow2_min_acceptable_pages(struct domain *d)
-{
- u32 vcpu_count = 0;
- struct vcpu *v;
-
- for_each_vcpu(d, v)
- vcpu_count++;
-
- return (vcpu_count * 128);
-}
-
-/* Using the type_info field to store freelist order */
-#define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
-#define SH2_SET_PFN_ORDER(_p, _o) \
- do { (_p)->u.inuse.type_info = (_o); } while (0)
-
-
-/* Figure out the order of allocation needed for a given shadow type */
-static inline u32
-shadow_order(u32 shadow_type)
-{
-#if CONFIG_PAGING_LEVELS > 2
- static const u32 type_to_order[16] = {
- 0, /* PGC_SH2_none */
- 1, /* PGC_SH2_l1_32_shadow */
- 1, /* PGC_SH2_fl1_32_shadow */
- 2, /* PGC_SH2_l2_32_shadow */
- 0, /* PGC_SH2_l1_pae_shadow */
- 0, /* PGC_SH2_fl1_pae_shadow */
- 0, /* PGC_SH2_l2_pae_shadow */
- 0, /* PGC_SH2_l2h_pae_shadow */
- 1, /* PGC_SH2_l3_pae_shadow */
- 0, /* PGC_SH2_l1_64_shadow */
- 0, /* PGC_SH2_fl1_64_shadow */
- 0, /* PGC_SH2_l2_64_shadow */
- 0, /* PGC_SH2_l3_64_shadow */
- 0, /* PGC_SH2_l4_64_shadow */
- 2, /* PGC_SH2_p2m_table */
- 0 /* PGC_SH2_monitor_table */
- };
- u32 type = (shadow_type & PGC_SH2_type_mask) >> PGC_SH2_type_shift;
- return type_to_order[type];
-#else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
- return 0;
-#endif
-}
-
-
-/* Do we have a free chunk of at least this order? */
-static inline int chunk_is_available(struct domain *d, int order)
-{
- int i;
-
- for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
- if ( !list_empty(&d->arch.shadow2.freelists[i]) )
- return 1;
- return 0;
-}
-
-/* Dispatcher function: call the per-mode function that will unhook the
- * non-Xen mappings in this top-level shadow mfn */
-void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn)
-{
- struct page_info *pg = mfn_to_page(smfn);
- switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift )
- {
- case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
-#if CONFIG_PAGING_LEVELS == 2
- SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn);
-#else
- SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn);
-#endif
- break;
-#if CONFIG_PAGING_LEVELS >= 3
- case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn);
- break;
-#endif
-#if CONFIG_PAGING_LEVELS >= 4
- case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn);
- break;
-#endif
- default:
- SHADOW2_PRINTK("top-level shadow has bad type %08lx\n",
- (unsigned long)((pg->count_info & PGC_SH2_type_mask)
- >> PGC_SH2_type_shift));
- BUG();
- }
-}
-
-
-/* Make sure there is at least one chunk of the required order available
- * in the shadow page pool. This must be called before any calls to
- * shadow2_alloc(). Since this will free existing shadows to make room,
- * it must be called early enough to avoid freeing shadows that the
- * caller is currently working on. */
-void shadow2_prealloc(struct domain *d, unsigned int order)
-{
- /* Need a vpcu for calling unpins; for now, since we don't have
- * per-vcpu shadows, any will do */
- struct vcpu *v = d->vcpu[0];
- struct list_head *l, *t;
- struct page_info *pg;
- mfn_t smfn;
-
- if ( chunk_is_available(d, order) ) return;
-
- /* Stage one: walk the list of top-level pages, unpinning them */
- perfc_incrc(shadow2_prealloc_1);
- list_for_each_backwards_safe(l, t, &d->arch.shadow2.toplevel_shadows)
- {
- pg = list_entry(l, struct page_info, list);
- smfn = page_to_mfn(pg);
-
-#if CONFIG_PAGING_LEVELS >= 3
- if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow )
- {
- /* For PAE, we need to unpin each subshadow on this shadow */
- SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn);
- }
- else
-#endif /* 32-bit code always takes this branch */
- {
- /* Unpin this top-level shadow */
- sh2_unpin(v, smfn);
- }
-
- /* See if that freed up a chunk of appropriate size */
- if ( chunk_is_available(d, order) ) return;
- }
-
- /* Stage two: all shadow pages are in use in hierarchies that are
- * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
- * mappings. */
- perfc_incrc(shadow2_prealloc_2);
- v = current;
- if ( v->domain != d )
- v = d->vcpu[0];
- /* Walk the list from the tail: recently used toplevels have been pulled
- * to the head */
- list_for_each_backwards_safe(l, t, &d->arch.shadow2.toplevel_shadows)
- {
- pg = list_entry(l, struct page_info, list);
- smfn = page_to_mfn(pg);
- shadow2_unhook_mappings(v, smfn);
-
- /* Need to flush TLB if we've altered our own tables */
- if ( !shadow2_mode_external(d)
- && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
- local_flush_tlb();
-
- /* See if that freed up a chunk of appropriate size */
- if ( chunk_is_available(d, order) ) return;
- }
-
- /* Nothing more we can do: all remaining shadows are of pages that
- * hold Xen mappings for some vcpu. This can never happen. */
- SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n"
- " shadow pages total = %u, free = %u, p2m=%u\n",
- 1 << order,
- d->arch.shadow2.total_pages,
- d->arch.shadow2.free_pages,
- d->arch.shadow2.p2m_pages);
- BUG();
-}
-
-
-/* Allocate another shadow's worth of (contiguous, aligned) pages,
- * and fill in the type and backpointer fields of their page_infos.
- * Never fails to allocate. */
-mfn_t shadow2_alloc(struct domain *d,
- u32 shadow_type,
- unsigned long backpointer)
-{
- struct page_info *pg = NULL;
- unsigned int order = shadow_order(shadow_type);
- cpumask_t mask;
- void *p;
- int i;
-
- ASSERT(shadow2_lock_is_acquired(d));
- ASSERT(order <= SHADOW2_MAX_ORDER);
- ASSERT(shadow_type != PGC_SH2_none);
- perfc_incrc(shadow2_alloc);
-
- /* Find smallest order which can satisfy the request. */
- for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
- if ( !list_empty(&d->arch.shadow2.freelists[i]) )
- {
- pg = list_entry(d->arch.shadow2.freelists[i].next,
- struct page_info, list);
- list_del(&pg->list);
-
- /* We may have to halve the chunk a number of times. */
- while ( i != order )
- {
- i--;
- SH2_SET_PFN_ORDER(pg, i);
- list_add_tail(&pg->list, &d->arch.shadow2.freelists[i]);
- pg += 1 << i;
- }
- d->arch.shadow2.free_pages -= 1 << order;
-
- /* Init page info fields and clear the pages */
- for ( i = 0; i < 1<<order ; i++ )
- {
- pg[i].u.inuse.type_info = backpointer;
- pg[i].count_info = shadow_type;
- pg[i].shadow2_flags = 0;
- INIT_LIST_HEAD(&pg[i].list);
- /* Before we overwrite the old contents of this page,
- * we need to be sure that no TLB holds a pointer to it. */
- mask = d->domain_dirty_cpumask;
- tlbflush_filter(mask, pg[i].tlbflush_timestamp);
- if ( unlikely(!cpus_empty(mask)) )
- {
- perfc_incrc(shadow2_alloc_tlbflush);
- flush_tlb_mask(mask);
- }
- /* Now safe to clear the page for reuse */
- p = sh2_map_domain_page(page_to_mfn(pg+i));
- ASSERT(p != NULL);
- clear_page(p);
- sh2_unmap_domain_page(p);
- perfc_incr(shadow2_alloc_count);
- }
- return page_to_mfn(pg);
- }
-
- /* If we get here, we failed to allocate. This should never happen.
- * It means that we didn't call shadow2_prealloc() correctly before
- * we allocated. We can't recover by calling prealloc here, because
- * we might free up higher-level pages that the caller is working on. */
- SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
- BUG();
-}
-
-
-/* Return some shadow pages to the pool. */
-void shadow2_free(struct domain *d, mfn_t smfn)
-{
- struct page_info *pg = mfn_to_page(smfn);
- u32 shadow_type;
- unsigned long order;
- unsigned long mask;
- int i;
-
- ASSERT(shadow2_lock_is_acquired(d));
- perfc_incrc(shadow2_free);
-
- shadow_type = pg->count_info & PGC_SH2_type_mask;
- ASSERT(shadow_type != PGC_SH2_none);
- ASSERT(shadow_type != PGC_SH2_p2m_table);
- order = shadow_order(shadow_type);
-
- d->arch.shadow2.free_pages += 1 << order;
-
- for ( i = 0; i < 1<<order; i++ )
- {
- /* Strip out the type: this is now a free shadow page */
- pg[i].count_info = 0;
- /* Remember the TLB timestamp so we will know whether to flush
- * TLBs when we reuse the page. Because the destructors leave the
- * contents of the pages in place, we can delay TLB flushes until
- * just before the allocator hands the page out again. */
- pg[i].tlbflush_timestamp = tlbflush_current_time();
- perfc_decr(shadow2_alloc_count);
- }
-
- /* Merge chunks as far as possible. */
- while ( order < SHADOW2_MAX_ORDER )
- {
- mask = 1 << order;
- if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
- /* Merge with predecessor block? */
- if ( (((pg-mask)->count_info & PGC_SH2_type_mask) != PGT_none)
- || (SH2_PFN_ORDER(pg-mask) != order) )
- break;
- list_del(&(pg-mask)->list);
- pg -= mask;
- } else {
- /* Merge with successor block? */
- if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGT_none)
- || (SH2_PFN_ORDER(pg+mask) != order) )
- break;
- list_del(&(pg+mask)->list);
- }
- order++;
- }
-
- SH2_SET_PFN_ORDER(pg, order);
- list_add_tail(&pg->list, &d->arch.shadow2.freelists[order]);
-}
-
-/* Divert some memory from the pool to be used by the p2m mapping.
- * This action is irreversible: the p2m mapping only ever grows.
- * That's OK because the p2m table only exists for external domains,
- * and those domains can't ever turn off shadow mode.
- * Also, we only ever allocate a max-order chunk, so as to preserve
- * the invariant that shadow2_prealloc() always works.
- * Returns 0 iff it can't get a chunk (the caller should then
- * free up some pages in domheap and call set_sh2_allocation);
- * returns non-zero on success.
- */
-static int
-shadow2_alloc_p2m_pages(struct domain *d)
-{
- struct page_info *pg;
- u32 i;
- ASSERT(shadow2_lock_is_acquired(d));
-
- if ( d->arch.shadow2.total_pages
- < (shadow2_min_acceptable_pages(d) + (1<<SHADOW2_MAX_ORDER)) )
- return 0; /* Not enough shadow memory: need to increase it first */
-
- pg = mfn_to_page(shadow2_alloc(d, PGC_SH2_p2m_table, 0));
- d->arch.shadow2.p2m_pages += (1<<SHADOW2_MAX_ORDER);
- d->arch.shadow2.total_pages -= (1<<SHADOW2_MAX_ORDER);
- for (i = 0; i < (1<<SHADOW2_MAX_ORDER); i++)
- {
- /* Unlike shadow pages, mark p2m pages as owned by the domain */
- page_set_owner(&pg[i], d);
- list_add_tail(&pg[i].list, &d->arch.shadow2.p2m_freelist);
- }
- return 1;
-}
-
-// Returns 0 if no memory is available...
-mfn_t
-shadow2_alloc_p2m_page(struct domain *d)
-{
- struct list_head *entry;
- mfn_t mfn;
- void *p;
-
- if ( list_empty(&d->arch.shadow2.p2m_freelist) &&
- !shadow2_alloc_p2m_pages(d) )
- return _mfn(0);
- entry = d->arch.shadow2.p2m_freelist.next;
- list_del(entry);
- list_add_tail(entry, &d->arch.shadow2.p2m_inuse);
- mfn = page_to_mfn(list_entry(entry, struct page_info, list));
- sh2_get_ref(mfn, 0);
- p = sh2_map_domain_page(mfn);
- clear_page(p);
- sh2_unmap_domain_page(p);
-
- return mfn;
-}
-
-#if CONFIG_PAGING_LEVELS == 3
-static void p2m_install_entry_in_monitors(struct domain *d,
- l3_pgentry_t *l3e)
-/* Special case, only used for external-mode domains on PAE hosts:
- * update the mapping of the p2m table. Once again, this is trivial in
- * other paging modes (one top-level entry points to the top-level p2m,
- * no maintenance needed), but PAE makes life difficult by needing a
- * copy the eight l3es of the p2m table in eight l2h slots in the
- * monitor table. This function makes fresh copies when a p2m l3e
- * changes. */
-{
- l2_pgentry_t *ml2e;
- struct vcpu *v;
- unsigned int index;
-
- index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
- ASSERT(index < MACHPHYS_MBYTES>>1);
-
- for_each_vcpu(d, v)
- {
- if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
- continue;
- ASSERT(shadow2_mode_external(v->domain));
-
- SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
- d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
-
- if ( v == current ) /* OK to use linear map of monitor_table */
- ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
- else
- {
- l3_pgentry_t *ml3e;
- ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
- ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
- ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
- ml2e += l2_table_offset(RO_MPT_VIRT_START);
- sh2_unmap_domain_page(ml3e);
- }
- ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
- if ( v != current )
- sh2_unmap_domain_page(ml2e);
- }
-}
-#endif
-
-// Find the next level's P2M entry, checking for out-of-range gfn's...
-// Returns NULL on error.
-//
-static l1_pgentry_t *
-p2m_find_entry(void *table, unsigned long *gfn_remainder,
- unsigned long gfn, u32 shift, u32 max)
-{
- u32 index;
-
- index = *gfn_remainder >> shift;
- if ( index >= max )
- {
- SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range "
- "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
- gfn, *gfn_remainder, shift, index, max);
- return NULL;
- }
- *gfn_remainder &= (1 << shift) - 1;
- return (l1_pgentry_t *)table + index;
-}
-
-// Walk one level of the P2M table, allocating a new table if required.
-// Returns 0 on error.
-//
-static int
-p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
- unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
- u32 max, unsigned long type)
-{
- l1_pgentry_t *p2m_entry;
- void *next;
-
- if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
- shift, max)) )
- return 0;
-
- if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
- {
- mfn_t mfn = shadow2_alloc_p2m_page(d);
- if ( mfn_x(mfn) == 0 )
- return 0;
- *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
- mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
- mfn_to_page(mfn)->count_info = 1;
-#if CONFIG_PAGING_LEVELS == 3
- if (type == PGT_l2_page_table)
- {
- /* We have written to the p2m l3: need to sync the per-vcpu
- * copies of it in the monitor tables */
- p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
- }
-#endif
- /* The P2M can be shadowed: keep the shadows synced */
- if ( d->vcpu[0] )
- (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn,
- p2m_entry, sizeof *p2m_entry);
- }
- *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
- next = sh2_map_domain_page(*table_mfn);
- sh2_unmap_domain_page(*table);
- *table = next;
-
- return 1;
-}
-
-// Returns 0 on error (out of memory)
-int
-shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
-{
- // XXX -- this might be able to be faster iff current->domain == d
- mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
- void *table = sh2_map_domain_page(table_mfn);
- unsigned long gfn_remainder = gfn;
- l1_pgentry_t *p2m_entry;
-
-#if CONFIG_PAGING_LEVELS >= 4
- if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
- L4_PAGETABLE_SHIFT - PAGE_SHIFT,
- L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
- return 0;
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
- // When using PAE Xen, we only allow 33 bits of pseudo-physical
- // address in translated guests (i.e. 8 GBytes). This restriction
- // comes from wanting to map the P2M table into the 16MB RO_MPT hole
- // in Xen's address space for translated PV guests.
- //
- if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
- L3_PAGETABLE_SHIFT - PAGE_SHIFT,
- (CONFIG_PAGING_LEVELS == 3
- ? 8
- : L3_PAGETABLE_ENTRIES),
- PGT_l2_page_table) )
- return 0;
-#endif
- if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
- L2_PAGETABLE_SHIFT - PAGE_SHIFT,
- L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
- return 0;
-
- p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
- 0, L1_PAGETABLE_ENTRIES);
- ASSERT(p2m_entry);
- if ( valid_mfn(mfn) )
- *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
- else
- *p2m_entry = l1e_empty();
-
- /* The P2M can be shadowed: keep the shadows synced */
- (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn,
- p2m_entry, sizeof *p2m_entry);
-
- sh2_unmap_domain_page(table);
-
- return 1;
-}
-
-// Allocate a new p2m table for a domain.
-//
-// The structure of the p2m table is that of a pagetable for xen (i.e. it is
-// controlled by CONFIG_PAGING_LEVELS).
-//
-// Returns 0 if p2m table could not be initialized
-//
-static int
-shadow2_alloc_p2m_table(struct domain *d)
-{
- mfn_t p2m_top;
- struct list_head *entry;
- unsigned int page_count = 0;
-
- SHADOW2_PRINTK("allocating p2m table\n");
- ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
-
- p2m_top = shadow2_alloc_p2m_page(d);
- mfn_to_page(p2m_top)->count_info = 1;
- mfn_to_page(p2m_top)->u.inuse.type_info =
-#if CONFIG_PAGING_LEVELS == 4
- PGT_l4_page_table
-#elif CONFIG_PAGING_LEVELS == 3
- PGT_l3_page_table
-#elif CONFIG_PAGING_LEVELS == 2
- PGT_l2_page_table
-#endif
- | 1 | PGT_validated;
-
- if ( mfn_x(p2m_top) == 0 )
- return 0;
-
- d->arch.phys_table = pagetable_from_mfn(p2m_top);
-
- SHADOW2_PRINTK("populating p2m table\n");
-
- for ( entry = d->page_list.next;
- entry != &d->page_list;
- entry = entry->next )
- {
- struct page_info *page = list_entry(entry, struct page_info, list);
- mfn_t mfn = page_to_mfn(page);
- unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
- page_count++;
- if (
-#ifdef __x86_64__
- (gfn != 0x5555555555555555L)
-#else
- (gfn != 0x55555555L)
-#endif
- && gfn != INVALID_M2P_ENTRY
- && !shadow2_set_p2m_entry(d, gfn, mfn) )
- {
- SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH2_PRI_mfn "\n",
- gfn, mfn_x(mfn));
- return 0;
- }
- }
-
- SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count);
- return 1;
-}
-
-mfn_t
-sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
-/* Read another domain's p2m entries */
-{
- mfn_t mfn;
- unsigned long addr = gpfn << PAGE_SHIFT;
- l2_pgentry_t *l2e;
- l1_pgentry_t *l1e;
-
- ASSERT(shadow2_mode_translate(d));
- mfn = pagetable_get_mfn(d->arch.phys_table);
-
-
-#if CONFIG_PAGING_LEVELS > 2
- if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
- /* This pfn is higher than the p2m map can hold */
- return _mfn(INVALID_MFN);
-#endif
-
-
-#if CONFIG_PAGING_LEVELS >= 4
- {
- l4_pgentry_t *l4e = sh2_map_domain_page(mfn);
- l4e += l4_table_offset(addr);
- if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
- {
- sh2_unmap_domain_page(l4e);
- return _mfn(INVALID_MFN);
- }
- mfn = _mfn(l4e_get_pfn(*l4e));
- sh2_unmap_domain_page(l4e);
- }
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
- {
- l3_pgentry_t *l3e = sh2_map_domain_page(mfn);
- l3e += l3_table_offset(addr);
- if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
- {
- sh2_unmap_domain_page(l3e);
- return _mfn(INVALID_MFN);
- }
- mfn = _mfn(l3e_get_pfn(*l3e));
- sh2_unmap_domain_page(l3e);
- }
-#endif
-
- l2e = sh2_map_domain_page(mfn);
- l2e += l2_table_offset(addr);
- if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
- {
- sh2_unmap_domain_page(l2e);
- return _mfn(INVALID_MFN);
- }
- mfn = _mfn(l2e_get_pfn(*l2e));
- sh2_unmap_domain_page(l2e);
-
- l1e = sh2_map_domain_page(mfn);
- l1e += l1_table_offset(addr);
- if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
- {
- sh2_unmap_domain_page(l1e);
- return _mfn(INVALID_MFN);
- }
- mfn = _mfn(l1e_get_pfn(*l1e));
- sh2_unmap_domain_page(l1e);
-
- return mfn;
-}
-
-unsigned long
-shadow2_gfn_to_mfn_foreign(unsigned long gpfn)
-{
- return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn));
-}
-
-
-static void shadow2_p2m_teardown(struct domain *d)
-/* Return all the p2m pages to Xen.
- * We know we don't have any extra mappings to these pages */
-{
- struct list_head *entry, *n;
- struct page_info *pg;
-
- d->arch.phys_table = pagetable_null();
-
- list_for_each_safe(entry, n, &d->arch.shadow2.p2m_inuse)
- {
- pg = list_entry(entry, struct page_info, list);
- list_del(entry);
- /* Should have just the one ref we gave it in alloc_p2m_page() */
- if ( (pg->count_info & PGC_SH2_count_mask) != 1 )
- {
- SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
- pg->count_info, pg->u.inuse.type_info);
- }
- ASSERT(page_get_owner(pg) == d);
- /* Free should not decrement domain's total allocation, since
- * these pages were allocated without an owner. */
- page_set_owner(pg, NULL);
- free_domheap_pages(pg, 0);
- d->arch.shadow2.p2m_pages--;
- perfc_decr(shadow2_alloc_count);
- }
- list_for_each_safe(entry, n, &d->arch.shadow2.p2m_freelist)
- {
- list_del(entry);
- pg = list_entry(entry, struct page_info, list);
- ASSERT(page_get_owner(pg) == d);
- /* Free should not decrement domain's total allocation. */
- page_set_owner(pg, NULL);
- free_domheap_pages(pg, 0);
- d->arch.shadow2.p2m_pages--;
- perfc_decr(shadow2_alloc_count);
- }
- ASSERT(d->arch.shadow2.p2m_pages == 0);
-}
-
-/* Set the pool of shadow pages to the required number of pages.
- * Input will be rounded up to at least shadow2_min_acceptable_pages(),
- * plus space for the p2m table.
- * Returns 0 for success, non-zero for failure. */
-static unsigned int set_sh2_allocation(struct domain *d,
- unsigned int pages,
- int *preempted)
-{
- struct page_info *pg;
- unsigned int lower_bound;
- int j;
-
- ASSERT(shadow2_lock_is_acquired(d));
-
- /* Don't allocate less than the minimum acceptable, plus one page per
- * megabyte of RAM (for the p2m table) */
- lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256);
- if ( pages > 0 && pages < lower_bound )
- pages = lower_bound;
- /* Round up to largest block size */
- pages = (pages + ((1<<SHADOW2_MAX_ORDER)-1)) & ~((1<<SHADOW2_MAX_ORDER)-1);
-
- SHADOW2_PRINTK("current %i target %i\n",
- d->arch.shadow2.total_pages, pages);
-
- while ( d->arch.shadow2.total_pages != pages )
- {
- if ( d->arch.shadow2.total_pages < pages )
- {
- /* Need to allocate more memory from domheap */
- pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0);
- if ( pg == NULL )
- {
- SHADOW2_PRINTK("failed to allocate shadow pages.\n");
- return -ENOMEM;
- }
- d->arch.shadow2.free_pages += 1<<SHADOW2_MAX_ORDER;
- d->arch.shadow2.total_pages += 1<<SHADOW2_MAX_ORDER;
- for ( j = 0; j < 1<<SHADOW2_MAX_ORDER; j++ )
- {
- pg[j].u.inuse.type_info = 0; /* Free page */
- pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
- }
- SH2_SET_PFN_ORDER(pg, SHADOW2_MAX_ORDER);
- list_add_tail(&pg->list,
- &d->arch.shadow2.freelists[SHADOW2_MAX_ORDER]);
- }
- else if ( d->arch.shadow2.total_pages > pages )
- {
- /* Need to return memory to domheap */
- shadow2_prealloc(d, SHADOW2_MAX_ORDER);
- ASSERT(!list_empty(&d->arch.shadow2.freelists[SHADOW2_MAX_ORDER]));
- pg = list_entry(d->arch.shadow2.freelists[SHADOW2_MAX_ORDER].next,
- struct page_info, list);
- list_del(&pg->list);
- d->arch.shadow2.free_pages -= 1<<SHADOW2_MAX_ORDER;
- d->arch.shadow2.total_pages -= 1<<SHADOW2_MAX_ORDER;
- free_domheap_pages(pg, SHADOW2_MAX_ORDER);
- }
-
- /* Check to see if we need to yield and try again */
- if ( preempted && hypercall_preempt_check() )
- {
- *preempted = 1;
- return 0;
- }
- }
-
- return 0;
-}
-
-unsigned int shadow2_set_allocation(struct domain *d,
- unsigned int megabytes,
- int *preempted)
-/* Hypercall interface to set the shadow memory allocation */
-{
- unsigned int rv;
- shadow2_lock(d);
- rv = set_sh2_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
- SHADOW2_PRINTK("dom %u allocation now %u pages (%u MB)\n",
- d->domain_id,
- d->arch.shadow2.total_pages,
- shadow2_get_allocation(d));
- shadow2_unlock(d);
- return rv;
-}
-
-/**************************************************************************/
-/* Hash table for storing the guest->shadow mappings */
-
-/* Hash function that takes a gfn or mfn, plus another byte of type info */
-typedef u32 key_t;
-static inline key_t sh2_hash(unsigned long n, u8 t)
-{
- unsigned char *p = (unsigned char *)&n;
- key_t k = t;
- int i;
- for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
- return k;
-}
-
-#if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL)
-
-/* Before we get to the mechanism, define a pair of audit functions
- * that sanity-check the contents of the hash table. */
-static void sh2_hash_audit_bucket(struct domain *d, int bucket)
-/* Audit one bucket of the hash table */
-{
- struct shadow2_hash_entry *e, *x;
- struct page_info *pg;
-
- if ( !(SHADOW2_AUDIT_ENABLE) )
- return;
-
- e = &d->arch.shadow2.hash_table[bucket];
- if ( e->t == 0 ) return; /* Bucket is empty */
- while ( e )
- {
- /* Empty link? */
- BUG_ON( e->t == 0 );
- /* Bogus type? */
- BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) );
- /* Wrong bucket? */
- BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket );
- /* Duplicate entry? */
- for ( x = e->next; x; x = x->next )
- BUG_ON( x->n == e->n && x->t == e->t );
- /* Bogus MFN? */
- BUG_ON( !valid_mfn(e->smfn) );
- pg = mfn_to_page(e->smfn);
- /* Not a shadow? */
- BUG_ON( page_get_owner(pg) != 0 );
- /* Wrong kind of shadow? */
- BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift
- != e->t );
- /* Bad backlink? */
- BUG_ON( pg->u.inuse.type_info != e->n );
- if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
- && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
- && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) )
- {
- /* Bad shadow flags on guest page? */
- BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<<e->t)) );
- }
- /* That entry was OK; on we go */
- e = e->next;
- }
-}
-
-#else
-#define sh2_hash_audit_bucket(_d, _b)
-#endif /* Hashtable bucket audit */
-
-
-#if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL
-
-static void sh2_hash_audit(struct domain *d)
-/* Full audit: audit every bucket in the table */
-{
- int i;
-
- if ( !(SHADOW2_AUDIT_ENABLE) )
- return;
-
- for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ )
- {
- sh2_hash_audit_bucket(d, i);
- }
-}
-
-#else
-#define sh2_hash_audit(_d)
-#endif /* Hashtable bucket audit */
-
-/* Memory management interface for bucket allocation.
- * These ought to come out of shadow memory, but at least on 32-bit
- * machines we are forced to allocate them from xenheap so that we can
- * address them. */
-static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d)
-{
- struct shadow2_hash_entry *extra, *x;
- int i;
-
- /* We need to allocate a new node. Ensure the free list is not empty.
- * Allocate new entries in units the same size as the original table. */
- if ( unlikely(d->arch.shadow2.hash_freelist == NULL) )
- {
- size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x));
- extra = xmalloc_bytes(sz);
-
- if ( extra == NULL )
- {
- /* No memory left! */
- SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n");
- domain_crash_synchronous();
- }
- memset(extra, 0, sz);
-
- /* Record the allocation block so it can be correctly freed later. */
- *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) =
- d->arch.shadow2.hash_allocations;
- d->arch.shadow2.hash_allocations = &extra[0];
-
- /* Thread a free chain through the newly-allocated nodes. */
- for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ )
- extra[i].next = &extra[i+1];
- extra[i].next = NULL;
-
- /* Add the new nodes to the free list. */
- d->arch.shadow2.hash_freelist = &extra[0];
- }
-
- /* Allocate a new node from the free list. */
- x = d->arch.shadow2.hash_freelist;
- d->arch.shadow2.hash_freelist = x->next;
- return x;
-}
-
-static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e)
-{
- /* Mark the bucket as empty and return it to the free list */
- e->t = 0;
- e->next = d->arch.shadow2.hash_freelist;
- d->arch.shadow2.hash_freelist = e;
-}
-
-
-/* Allocate and initialise the table itself.
- * Returns 0 for success, 1 for error. */
-static int shadow2_hash_alloc(struct domain *d)
-{
- struct shadow2_hash_entry *table;
-
- ASSERT(shadow2_lock_is_acquired(d));
- ASSERT(!d->arch.shadow2.hash_table);
-
- table = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS);
- if ( !table ) return 1;
- memset(table, 0,
- SHADOW2_HASH_BUCKETS * sizeof (struct shadow2_hash_entry));
- d->arch.shadow2.hash_table = table;
- return 0;
-}
-
-/* Tear down the hash table and return all memory to Xen.
- * This function does not care whether the table is populated. */
-static void shadow2_hash_teardown(struct domain *d)
-{
- struct shadow2_hash_entry *a, *n;
-
- ASSERT(shadow2_lock_is_acquired(d));
- ASSERT(d->arch.shadow2.hash_table);
-
- /* Return the table itself */
- xfree(d->arch.shadow2.hash_table);
- d->arch.shadow2.hash_table = NULL;
-
- /* Return any extra allocations */
- a = d->arch.shadow2.hash_allocations;
- while ( a )
- {
- /* We stored a linked-list pointer at the end of each allocation */
- n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS]));
- xfree(a);
- a = n;
- }
- d->arch.shadow2.hash_allocations = NULL;
- d->arch.shadow2.hash_freelist = NULL;
-}
-
-
-mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
-/* Find an entry in the hash table. Returns the MFN of the shadow,
- * or INVALID_MFN if it doesn't exist */
-{
- struct domain *d = v->domain;
- struct shadow2_hash_entry *p, *x, *head;
- key_t key;
-
- ASSERT(shadow2_lock_is_acquired(d));
- ASSERT(d->arch.shadow2.hash_table);
- ASSERT(t);
-
- sh2_hash_audit(d);
-
- perfc_incrc(shadow2_hash_lookups);
- key = sh2_hash(n, t);
-
- x = head = &d->arch.shadow2.hash_table[key % SHADOW2_HASH_BUCKETS];
- p = NULL;
-
- sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
-
- do
- {
- ASSERT(x->t || ((x == head) && (x->next == NULL)));
-
- if ( x->n == n && x->t == t )
- {
- /* Pull-to-front if 'x' isn't already the head item */
- if ( unlikely(x != head) )
- {
- if ( unlikely(d->arch.shadow2.hash_walking != 0) )
- /* Can't reorder: someone is walking the hash chains */
- return x->smfn;
- else
- {
- /* Delete 'x' from list and reinsert after head. */
- p->next = x->next;
- x->next = head->next;
- head->next = x;
-
- /* Swap 'x' contents with head contents. */
- SWAP(head->n, x->n);
- SWAP(head->t, x->t);
- SWAP(head->smfn, x->smfn);
- }
- }
- else
- {
- perfc_incrc(shadow2_hash_lookup_head);
- }
- return head->smfn;
- }
-
- p = x;
- x = x->next;
- }
- while ( x != NULL );
-
- perfc_incrc(shadow2_hash_lookup_miss);
- return _mfn(INVALID_MFN);
-}
-
-void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
-/* Put a mapping (n,t)->smfn into the hash table */
-{
- struct domain *d = v->domain;
- struct shadow2_hash_entry *x, *head;
- key_t key;
-
- ASSERT(shadow2_lock_is_acquired(d));
- ASSERT(d->arch.shadow2.hash_table);
- ASSERT(t);
-
- sh2_hash_audit(d);
-
- perfc_incrc(shadow2_hash_inserts);
- key = sh2_hash(n, t);
-
- head = &d->arch.shadow2.hash_table[key % SHADOW2_HASH_BUCKETS];
-
- sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
-
- /* If the bucket is empty then insert the new page as the head item. */
- if ( head->t == 0 )
- {
- head->n = n;
- head->t = t;
- head->smfn = smfn;
- ASSERT(head->next == NULL);
- }
- else
- {
- /* Insert a new entry directly after the head item. */
- x = sh2_alloc_hash_entry(d);
- x->n = n;
- x->t = t;
- x->smfn = smfn;
- x->next = head->next;
- head->next = x;
- }
-
- sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
-}
-
-void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
-/* Excise the mapping (n,t)->smfn from the hash table */
-{
- struct domain *d = v->domain;
- struct shadow2_hash_entry *p, *x, *head;
- key_t key;
-
- ASSERT(shadow2_lock_is_acquired(d));
- ASSERT(d->arch.shadow2.hash_table);
- ASSERT(t);
-
- sh2_hash_audit(d);
-
- perfc_incrc(shadow2_hash_deletes);
- key = sh2_hash(n, t);
-
- head = &d->arch.shadow2.hash_table[key % SHADOW2_HASH_BUCKETS];
-
- sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
-
- /* Match on head item? */
- if ( head->n == n && head->t == t )
- {
- if ( (x = head->next) != NULL )
- {
- /* Overwrite head with contents of following node. */
- head->n = x->n;
- head->t = x->t;
- head->smfn = x->smfn;
-
- /* Delete following node. */
- head->next = x->next;
- sh2_free_hash_entry(d, x);
- }
- else
- {
- /* This bucket is now empty. Initialise the head node. */
- head->t = 0;
- }
- }
- else
- {
- /* Not at the head; need to walk the chain */
- p = head;
- x = head->next;
-
- while(1)
- {
- ASSERT(x); /* We can't have hit the end, since our target is
- * still in the chain somehwere... */
- if ( x->n == n && x->t == t )
- {
- /* Delete matching node. */
- p->next = x->next;
- sh2_free_hash_entry(d, x);
- break;
- }
- p = x;
- x = x->next;
- }
- }
-
- sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
-}
-
-typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
-
-static void hash_foreach(struct vcpu *v,
- unsigned int callback_mask,
- hash_callback_t callbacks[],
- mfn_t callback_mfn)
-/* Walk the hash table looking at the types of the entries and
- * calling the appropriate callback function for each entry.
- * The mask determines which shadow types we call back for, and the array
- * of callbacks tells us which function to call.
- * Any callback may return non-zero to let us skip the rest of the scan.
- *
- * WARNING: Callbacks MUST NOT add or remove hash entries unless they
- * then return non-zero to terminate the scan. */
-{
- int i, done = 0;
- struct domain *d = v->domain;
- struct shadow2_hash_entry *x;
-
- /* Say we're here, to stop hash-lookups reordering the chains */
- ASSERT(shadow2_lock_is_acquired(d));
- ASSERT(d->arch.shadow2.hash_walking == 0);
- d->arch.shadow2.hash_walking = 1;
-
- callback_mask &= ~1; /* Never attempt to call back on empty buckets */
- for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ )
- {
- /* WARNING: This is not safe against changes to the hash table.
- * The callback *must* return non-zero if it has inserted or
- * deleted anything from the hash (lookups are OK, though). */
- for ( x = &d->arch.shadow2.hash_table[i]; x; x = x->next )
- {
- if ( callback_mask & (1 << x->t) )
- {
- ASSERT(x->t <= 15);
- ASSERT(callbacks[x->t] != NULL);
- if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
- break;
- }
- }
- if ( done ) break;
- }
- d->arch.shadow2.hash_walking = 0;
-}
-
-
-/**************************************************************************/
-/* Destroy a shadow page: simple dispatcher to call the per-type destructor
- * which will decrement refcounts appropriately and return memory to the
- * free pool. */
-
-void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn)
-{
- struct page_info *pg = mfn_to_page(smfn);
- u32 t = pg->count_info & PGC_SH2_type_mask;
-
-
- SHADOW2_PRINTK("smfn=%#lx\n", mfn_x(smfn));
-
- /* Double-check, if we can, that the shadowed page belongs to this
- * domain, (by following the back-pointer). */
- ASSERT(t == PGC_SH2_fl1_32_shadow ||
- t == PGC_SH2_fl1_pae_shadow ||
- t == PGC_SH2_fl1_64_shadow ||
- t == PGC_SH2_monitor_table ||
- (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info)))
- == v->domain));
-
- /* The down-shifts here are so that the switch statement is on nice
- * small numbers that the compiler will enjoy */
- switch ( t >> PGC_SH2_type_shift )
- {
-#if CONFIG_PAGING_LEVELS == 2
- case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
- case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 2, 2)(v, smfn);
- break;
- case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 2, 2)(v, smfn);
- break;
-#else /* PAE or 64bit */
- case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
- case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 2)(v, smfn);
- break;
- case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 2)(v, smfn);
- break;
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
- case PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift:
- case PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 3)(v, smfn);
- break;
- case PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift:
- case PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 3)(v, smfn);
- break;
- case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 3, 3)(v, smfn);
- break;
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
- case PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift:
- case PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 4, 4)(v, smfn);
- break;
- case PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 4, 4)(v, smfn);
- break;
- case PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 4, 4)(v, smfn);
- break;
- case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
- SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, 4, 4)(v, smfn);
- break;
-#endif
- default:
- SHADOW2_PRINTK("tried to destroy shadow of bad type %08lx\n",
- (unsigned long)t);
- BUG();
- }
-}
-
-/**************************************************************************/
-/* Remove all writeable mappings of a guest frame from the shadow tables
- * Returns non-zero if we need to flush TLBs.
- * level and fault_addr desribe how we found this to be a pagetable;
- * level==0 means we have some other reason for revoking write access.*/
-
-int shadow2_remove_write_access(struct vcpu *v, mfn_t gmfn,
- unsigned int level,
- unsigned long fault_addr)
-{
- /* Dispatch table for getting per-type functions */
- static hash_callback_t callbacks[16] = {
- NULL, /* none */
-#if CONFIG_PAGING_LEVELS == 2
- SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* l1_32 */
- SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* fl1_32 */
-#else
- SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* l1_32 */
- SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* fl1_32 */
-#endif
- NULL, /* l2_32 */
-#if CONFIG_PAGING_LEVELS >= 3
- SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* l1_pae */
- SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* fl1_pae */
-#else
- NULL, /* l1_pae */
- NULL, /* fl1_pae */
-#endif
- NULL, /* l2_pae */
- NULL, /* l2h_pae */
- NULL, /* l3_pae */
-#if CONFIG_PAGING_LEVELS >= 4
- SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* l1_64 */
- SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* fl1_64 */
-#else
- NULL, /* l1_64 */
- NULL, /* fl1_64 */
-#endif
- NULL, /* l2_64 */
- NULL, /* l3_64 */
- NULL, /* l4_64 */
- NULL, /* p2m */
- NULL /* unused */
- };
-
- static unsigned int callback_mask =
- 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
- | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
- | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
- | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
- | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
- | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
- ;
- struct page_info *pg = mfn_to_page(gmfn);
-
- ASSERT(shadow2_lock_is_acquired(v->domain));
-
- /* Only remove writable mappings if we are doing shadow refcounts.
- * In guest refcounting, we trust Xen to already be restricting
- * all the writes to the guest page tables, so we do not need to
- * do more. */
- if ( !shadow2_mode_refcounts(v->domain) )
- return 0;
-
- /* Early exit if it's already a pagetable, or otherwise not writeable */
- if ( sh2_mfn_is_a_page_table(gmfn)
- || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
- return 0;
-
- perfc_incrc(shadow2_writeable);
-
- /* If this isn't a "normal" writeable page, the domain is trying to
- * put pagetables in special memory of some kind. We can't allow that. */
- if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
- {
- SHADOW2_ERROR("can't remove write access to mfn %lx, type_info is %"
- PRtype_info "\n",
- mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
- domain_crash(v->domain);
- }
-
-#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
- if ( v == current && level != 0 )
- {
- unsigned long gfn;
- /* Heuristic: there is likely to be only one writeable mapping,
- * and that mapping is likely to be in the current pagetable,
- * either in the guest's linear map (linux, windows) or in a
- * magic slot used to map high memory regions (linux HIGHTPTE) */
-
-#define GUESS(_a, _h) do { \
- if ( v->arch.shadow2.mode->guess_wrmap(v, (_a), gmfn) ) \
- perfc_incrc(shadow2_writeable_h_ ## _h); \
- if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
- return 1; \
- } while (0)
-
-
- /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
- if ( v == current
- && (gfn = sh2_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
- GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
-
- if ( v->arch.shadow2.mode->guest_levels == 2 )
- {
- if ( level == 1 )
- /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
- GUESS(0xC0000000UL + (fault_addr >> 10), 1);
- }
-#if CONFIG_PAGING_LEVELS >= 3
- else if ( v->arch.shadow2.mode->guest_levels == 3 )
- {
- /* 32bit PAE w2k3: linear map at 0xC0000000 */
- switch ( level )
- {
- case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
- case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
- }
- }
-#if CONFIG_PAGING_LEVELS >= 4
- else if ( v->arch.shadow2.mode->guest_levels == 4 )
- {
- /* 64bit w2k3: linear map at 0x0000070000000000 */
- switch ( level )
- {
- case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
- case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
- case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
- }
- }
-#endif /* CONFIG_PAGING_LEVELS >= 4 */
-#endif /* CONFIG_PAGING_LEVELS >= 3 */
-
-#undef GUESS
-
- }
-#endif
-
- /* Brute-force search of all the shadows, by walking the hash */
- perfc_incrc(shadow2_writeable_bf);
- hash_foreach(v, callback_mask, callbacks, gmfn);
-
- /* If that didn't catch the mapping, something is very wrong */
- if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
- {
- SHADOW2_ERROR("can't find all writeable mappings of mfn %lx: "
- "%lu left\n", mfn_x(gmfn),
- (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
- domain_crash(v->domain);
- }
-
- /* We killed at least one writeable mapping, so must flush TLBs. */
- return 1;
-}
-
-
-
-/**************************************************************************/
-/* Remove all mappings of a guest frame from the shadow tables.
- * Returns non-zero if we need to flush TLBs. */
-
-int shadow2_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
-{
- struct page_info *page = mfn_to_page(gmfn);
- int expected_count;
-
- /* Dispatch table for getting per-type functions */
- static hash_callback_t callbacks[16] = {
- NULL, /* none */
-#if CONFIG_PAGING_LEVELS == 2
- SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* l1_32 */
- SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* fl1_32 */
-#else
- SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* l1_32 */
- SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* fl1_32 */
-#endif
- NULL, /* l2_32 */
-#if CONFIG_PAGING_LEVELS >= 3
- SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* l1_pae */
- SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* fl1_pae */
-#else
- NULL, /* l1_pae */
- NULL, /* fl1_pae */
-#endif
- NULL, /* l2_pae */
- NULL, /* l2h_pae */
- NULL, /* l3_pae */
-#if CONFIG_PAGING_LEVELS >= 4
- SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* l1_64 */
- SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* fl1_64 */
-#else
- NULL, /* l1_64 */
- NULL, /* fl1_64 */
-#endif
- NULL, /* l2_64 */
- NULL, /* l3_64 */
- NULL, /* l4_64 */
- NULL, /* p2m */
- NULL /* unused */
- };
-
- static unsigned int callback_mask =
- 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
- | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
- | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
- | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
- | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
- | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
- ;
-
- perfc_incrc(shadow2_mappings);
- if ( (page->count_info & PGC_count_mask) == 0 )
- return 0;
-
- ASSERT(shadow2_lock_is_acquired(v->domain));
-
- /* XXX TODO:
- * Heuristics for finding the (probably) single mapping of this gmfn */
-
- /* Brute-force search of all the shadows, by walking the hash */
- perfc_incrc(shadow2_mappings_bf);
- hash_foreach(v, callback_mask, callbacks, gmfn);
-
- /* If that didn't catch the mapping, something is very wrong */
- expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
- if ( (page->count_info & PGC_count_mask) != expected_count )
- {
- /* Don't complain if we're in HVM and there's one extra mapping:
- * The qemu helper process has an untyped mapping of this dom's RAM */
- if ( !(shadow2_mode_external(v->domain)
- && (page->count_info & PGC_count_mask) <= 2
- && (page->u.inuse.type_info & PGT_count_mask) == 0) )
- {
- SHADOW2_ERROR("can't find all mappings of mfn %lx: "
- "c=%08x t=%08lx\n", mfn_x(gmfn),
- page->count_info, page->u.inuse.type_info);
- }
- }
-
- /* We killed at least one mapping, so must flush TLBs. */
- return 1;
-}
-
-
-/**************************************************************************/
-/* Remove all shadows of a guest frame from the shadow tables */
-
-static int sh2_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
-/* Follow this shadow's up-pointer, if it has one, and remove the reference
- * found there. Returns 1 if that was the only reference to this shadow */
-{
- struct page_info *pg = mfn_to_page(smfn);
- mfn_t pmfn;
- void *vaddr;
- int rc;
-
- ASSERT((pg->count_info & PGC_SH2_type_mask) > 0);
- ASSERT((pg->count_info & PGC_SH2_type_mask) < PGC_SH2_max_shadow);
- ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l2_32_shadow);
- ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l3_pae_shadow);
- ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l4_64_shadow);
-
- if (pg->up == 0) return 0;
- pmfn = _mfn(pg->up >> PAGE_SHIFT);
- ASSERT(valid_mfn(pmfn));
- vaddr = sh2_map_domain_page(pmfn);
- ASSERT(vaddr);
- vaddr += pg->up & (PAGE_SIZE-1);
- ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
-
- /* Is this the only reference to this shadow? */
- rc = ((pg->count_info & PGC_SH2_count_mask) == 1) ? 1 : 0;
-
- /* Blank the offending entry */
- switch ((pg->count_info & PGC_SH2_type_mask))
- {
- case PGC_SH2_l1_32_shadow:
- case PGC_SH2_l2_32_shadow:
-#if CONFIG_PAGING_LEVELS == 2
- SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,2,2)(v, vaddr, pmfn);
-#else
- SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,2)(v, vaddr, pmfn);
-#endif
- break;
-#if CONFIG_PAGING_LEVELS >=3
- case PGC_SH2_l1_pae_shadow:
- case PGC_SH2_l2_pae_shadow:
- case PGC_SH2_l2h_pae_shadow:
- case PGC_SH2_l3_pae_shadow:
- SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,3)(v, vaddr, pmfn);
- break;
-#if CONFIG_PAGING_LEVELS >= 4
- case PGC_SH2_l1_64_shadow:
- case PGC_SH2_l2_64_shadow:
- case PGC_SH2_l3_64_shadow:
- case PGC_SH2_l4_64_shadow:
- SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,4,4)(v, vaddr, pmfn);
- break;
-#endif
-#endif
- default: BUG(); /* Some wierd unknown shadow type */
- }
-
- sh2_unmap_domain_page(vaddr);
- if ( rc )
- perfc_incrc(shadow2_up_pointer);
- else
- perfc_incrc(shadow2_unshadow_bf);
-
- return rc;
-}
-
-void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
-/* Remove the shadows of this guest page.
- * If all != 0, find all shadows, if necessary by walking the tables.
- * Otherwise, just try the (much faster) heuristics, which will remove
- * at most one reference to each shadow of the page. */
-{
- struct page_info *pg;
- mfn_t smfn;
- u32 sh_flags;
- unsigned char t;
-
- /* Dispatch table for getting per-type functions: each level must
- * be called with the function to remove a lower-level shadow. */
- static hash_callback_t callbacks[16] = {
- NULL, /* none */
- NULL, /* l1_32 */
- NULL, /* fl1_32 */
-#if CONFIG_PAGING_LEVELS == 2
- SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,2,2), /* l2_32 */
-#else
- SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,2), /* l2_32 */
-#endif
- NULL, /* l1_pae */
- NULL, /* fl1_pae */
-#if CONFIG_PAGING_LEVELS >= 3
- SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2_pae */
- SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2h_pae */
- SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,3,3), /* l3_pae */
-#else
- NULL, /* l2_pae */
- NULL, /* l2h_pae */
- NULL, /* l3_pae */
-#endif
- NULL, /* l1_64 */
- NULL, /* fl1_64 */
-#if CONFIG_PAGING_LEVELS >= 4
- SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,4,4), /* l2_64 */
- SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,4,4), /* l3_64 */
- SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow,4,4), /* l4_64 */
-#else
- NULL, /* l2_64 */
- NULL, /* l3_64 */
- NULL, /* l4_64 */
-#endif
- NULL, /* p2m */
- NULL /* unused */
- };
-
- /* Another lookup table, for choosing which mask to use */
- static unsigned int masks[16] = {
- 0, /* none */
- 1 << (PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift), /* l1_32 */
- 0, /* fl1_32 */
- 0, /* l2_32 */
- ((1 << (PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift))
- | (1 << (PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift))), /* l1_pae */
- 0, /* fl1_pae */
- 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2_pae */
- 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2h_pae */
- 0, /* l3_pae */
- 1 << (PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift), /* l1_64 */
- 0, /* fl1_64 */
- 1 << (PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift), /* l2_64 */
- 1 << (PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift), /* l3_64 */
- 0, /* l4_64 */
- 0, /* p2m */
- 0 /* unused */
- };
-
- ASSERT(shadow2_lock_is_acquired(v->domain));
-
- pg = mfn_to_page(gmfn);
-
- /* Bale out now if the page is not shadowed */
- if ( (pg->count_info & PGC_page_table) == 0 )
- return;
-
- SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
- v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
-
- /* Search for this shadow in all appropriate shadows */
- perfc_incrc(shadow2_unshadow);
- sh_flags = pg->shadow2_flags;
-
- /* Lower-level shadows need to be excised from upper-level shadows.
- * This call to hash_foreach() looks dangerous but is in fact OK: each
- * call will remove at most one shadow, and terminate immediately when
- * it does remove it, so we never walk the hash after doing a deletion. */
-#define DO_UNSHADOW(_type) do { \
- t = (_type) >> PGC_SH2_type_shift; \
- smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \
- if ( !sh2_remove_shadow_via_pointer(v, smfn) && all ) \
- hash_foreach(v, masks[t], callbacks, smfn); \
-} while (0)
-
- /* Top-level shadows need to be unpinned */
-#define DO_UNPIN(_type) do { \
- t = (_type) >> PGC_SH2_type_shift; \
- smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \
- if ( mfn_to_page(smfn)->count_info & PGC_SH2_pinned ) \
- sh2_unpin(v, smfn); \
- if ( (_type) == PGC_SH2_l3_pae_shadow ) \
- SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); \
-} while (0)
-
- if ( sh_flags & SH2F_L1_32 ) DO_UNSHADOW(PGC_SH2_l1_32_shadow);
- if ( sh_flags & SH2F_L2_32 ) DO_UNPIN(PGC_SH2_l2_32_shadow);
-#if CONFIG_PAGING_LEVELS >= 3
- if ( sh_flags & SH2F_L1_PAE ) DO_UNSHADOW(PGC_SH2_l1_pae_shadow);
- if ( sh_flags & SH2F_L2_PAE ) DO_UNSHADOW(PGC_SH2_l2_pae_shadow);
- if ( sh_flags & SH2F_L2H_PAE ) DO_UNSHADOW(PGC_SH2_l2h_pae_shadow);
- if ( sh_flags & SH2F_L3_PAE ) DO_UNPIN(PGC_SH2_l3_pae_shadow);
-#if CONFIG_PAGING_LEVELS >= 4
- if ( sh_flags & SH2F_L1_64 ) DO_UNSHADOW(PGC_SH2_l1_64_shadow);
- if ( sh_flags & SH2F_L2_64 ) DO_UNSHADOW(PGC_SH2_l2_64_shadow);
- if ( sh_flags & SH2F_L3_64 ) DO_UNSHADOW(PGC_SH2_l3_64_shadow);
- if ( sh_flags & SH2F_L4_64 ) DO_UNPIN(PGC_SH2_l4_64_shadow);
-#endif
-#endif
-
-#undef DO_UNSHADOW
-#undef DO_UNPIN
-
-
-#if CONFIG_PAGING_LEVELS > 2
- /* We may have caused some PAE l3 entries to change: need to
- * fix up the copies of them in various places */
- if ( sh_flags & (SH2F_L2_PAE|SH2F_L2H_PAE) )
- sh2_pae_recopy(v->domain);
-#endif
-
- /* If that didn't catch the shadows, something is wrong */
- if ( all && (pg->count_info & PGC_page_table) )
- {
- SHADOW2_ERROR("can't find all shadows of mfn %05lx (shadow2_flags=%08x)\n",
- mfn_x(gmfn), pg->shadow2_flags);
- domain_crash(v->domain);
- }
-}
-
-void
-shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
-/* Even harsher: this is a HVM page that we thing is no longer a pagetable.
- * Unshadow it, and recursively unshadow pages that reference it. */
-{
- shadow2_remove_all_shadows(v, gmfn);
- /* XXX TODO:
- * Rework this hashtable walker to return a linked-list of all
- * the shadows it modified, then do breadth-first recursion
- * to find the way up to higher-level tables and unshadow them too.
- *
- * The current code (just tearing down each page's shadows as we
- * detect that it is not a pagetable) is correct, but very slow.
- * It means extra emulated writes and slows down removal of mappings. */
-}
-
-/**************************************************************************/
-
-void sh2_update_paging_modes(struct vcpu *v)
-{
- struct domain *d = v->domain;
- struct shadow2_paging_mode *old_mode = v->arch.shadow2.mode;
- mfn_t old_guest_table;
-
- ASSERT(shadow2_lock_is_acquired(d));
-
- // Valid transitions handled by this function:
- // - For PV guests:
- // - after a shadow mode has been changed
- // - For HVM guests:
- // - after a shadow mode has been changed
- // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
- //
-
- // Avoid determining the current shadow2 mode for uninitialized CPUs, as
- // we can not yet determine whether it is an HVM or PV domain.
- //
- if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
- {
- printk("%s: postponing determination of shadow2 mode\n", __func__);
- return;
- }
-
- // First, tear down any old shadow tables held by this vcpu.
- //
- shadow2_detach_old_tables(v);
-
- if ( !hvm_guest(v) )
- {
- ///
- /// PV guest
- ///
-#if CONFIG_PAGING_LEVELS == 4
- if ( pv_32bit_guest(v) )
- v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,4,3);
- else
- v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,4,4);
-#elif CONFIG_PAGING_LEVELS == 3
- v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,3,3);
-#elif CONFIG_PAGING_LEVELS == 2
- v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,2,2);
-#else
-#error unexpected paging mode
-#endif
- }
- else
- {
- ///
- /// HVM guest
- ///
- ASSERT(shadow2_mode_translate(d));
- ASSERT(shadow2_mode_external(d));
-
- v->arch.shadow2.hvm_paging_enabled = !!hvm_paging_enabled(v);
- if ( !v->arch.shadow2.hvm_paging_enabled )
- {
-
- /* Set v->arch.guest_table to use the p2m map, and choose
- * the appropriate shadow mode */
- old_guest_table = pagetable_get_mfn(v->arch.guest_table);
-#if CONFIG_PAGING_LEVELS == 2
- v->arch.guest_table =
- pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
- v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,2,2);
-#elif CONFIG_PAGING_LEVELS == 3
- v->arch.guest_table =
- pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
- v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,3,3);
-#else /* CONFIG_PAGING_LEVELS == 4 */
- {
- l4_pgentry_t *l4e;
- /* Use the start of the first l3 table as a PAE l3 */
- ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
- l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
- ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
- v->arch.guest_table =
- pagetable_from_pfn(l4e_get_pfn(l4e[0]));
- sh2_unmap_domain_page(l4e);
- }
- v->arch.shadow2.mode = &SHADOW2_INTERNAL_NAME(sh2_paging_mode,3,3);
-#endif
- /* Fix up refcounts on guest_table */
- get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
- if ( mfn_x(old_guest_table) != 0 )
- put_page(mfn_to_page(old_guest_table));
- }
- else
- {
-#ifdef __x86_64__
- if ( hvm_long_mode_enabled(v) )
- {
- // long mode guest...
- v->arch.shadow2.mode =
- &SHADOW2_INTERNAL_NAME(sh2_paging_mode, 4, 4);
- }
- else
-#endif
- if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
- {
-#if CONFIG_PAGING_LEVELS >= 3
- // 32-bit PAE mode guest...
- v->arch.shadow2.mode =
- &SHADOW2_INTERNAL_NAME(sh2_paging_mode, 3, 3);
-#else
- SHADOW2_ERROR("PAE not supported in 32-bit Xen\n");
- domain_crash(d);
- return;
-#endif
- }
- else
- {
- // 32-bit 2 level guest...
-#if CONFIG_PAGING_LEVELS >= 3
- v->arch.shadow2.mode =
- &SHADOW2_INTERNAL_NAME(sh2_paging_mode, 3, 2);
-#else
- v->arch.shadow2.mode =
- &SHADOW2_INTERNAL_NAME(sh2_paging_mode, 2, 2);
-#endif
- }
- }
-
- if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
- {
- mfn_t mmfn = shadow2_make_monitor_table(v);
- v->arch.monitor_table = pagetable_from_mfn(mmfn);
- v->arch.monitor_vtable = sh2_map_domain_page(mmfn);
- }
-
- if ( v->arch.shadow2.mode != old_mode )
- {
- SHADOW2_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
- "(was g=%u s=%u)\n",
- d->domain_id, v->vcpu_id,
- v->arch.shadow2.mode->guest_levels,
- v->arch.shadow2.mode->shadow_levels,
- old_mode ? old_mode->guest_levels : 0,
- old_mode ? old_mode->shadow_levels : 0);
- if ( old_mode &&
- (v->arch.shadow2.mode->shadow_levels !=
- old_mode->shadow_levels) )
- {
- /* Need to make a new monitor table for the new mode */
- mfn_t new_mfn, old_mfn;
-
- if ( v != current )
- {
- SHADOW2_ERROR("Some third party (d=%u v=%u) is changing "
- "this HVM vcpu's (d=%u v=%u) paging mode!\n",
- current->domain->domain_id, current->vcpu_id,
- v->domain->domain_id, v->vcpu_id);
- domain_crash(v->domain);
- return;
- }
-
- sh2_unmap_domain_page(v->arch.monitor_vtable);
- old_mfn = pagetable_get_mfn(v->arch.monitor_table);
- v->arch.monitor_table = pagetable_null();
- new_mfn = v->arch.shadow2.mode->make_monitor_table(v);
- v->arch.monitor_table = pagetable_from_mfn(new_mfn);
- v->arch.monitor_vtable = sh2_map_domain_page(new_mfn);
- SHADOW2_PRINTK("new monitor table %"SH2_PRI_mfn "\n",
- mfn_x(new_mfn));
-
- /* Don't be running on the old monitor table when we
- * pull it down! Switch CR3, and warn the HVM code that
- * its host cr3 has changed. */
- make_cr3(v, mfn_x(new_mfn));
- write_ptbase(v);
- hvm_update_host_cr3(v);
- old_mode->destroy_monitor_table(v, old_mfn);
- }
- }
-
- // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
- // These are HARD: think about the case where two CPU's have
- // different values for CR4.PSE and CR4.PGE at the same time.
- // This *does* happen, at least for CR4.PGE...
- }
-
- v->arch.shadow2.mode->update_cr3(v);
-}
-
-/**************************************************************************/
-/* Turning on and off shadow2 features */
-
-static void sh2_new_mode(struct domain *d, u32 new_mode)
-/* Inform all the vcpus that the shadow mode has been changed */
-{
- struct vcpu *v;
-
- ASSERT(shadow2_lock_is_acquired(d));
- ASSERT(d != current->domain);
- d->arch.shadow2.mode = new_mode;
- if ( new_mode & SHM2_translate )
- shadow2_audit_p2m(d);
- for_each_vcpu(d, v)
- sh2_update_paging_modes(v);
-}
-
-static int shadow2_enable(struct domain *d, u32 mode)
-/* Turn on "permanent" shadow features: external, translate, refcount.
- * Can only be called once on a domain, and these features cannot be
- * disabled.
- * Returns 0 for success, -errno for failure. */
-{
- unsigned int old_pages;
- int rv = 0;
-
- mode |= SHM2_enable;
-
- domain_pause(d);
- shadow2_lock(d);
-
- /* Sanity check the arguments */
- if ( (d == current->domain) ||
- shadow2_mode_enabled(d) ||
- ((mode & SHM2_external) && !(mode & SHM2_translate)) )
- {
- rv = -EINVAL;
- goto out;
- }
-
- // XXX -- eventually would like to require that all memory be allocated
- // *after* shadow2_enabled() is called... So here, we would test to make
- // sure that d->page_list is empty.
-#if 0
- spin_lock(&d->page_alloc_lock);
- if ( !list_empty(&d->page_list) )
- {
- spin_unlock(&d->page_alloc_lock);
- rv = -EINVAL;
- goto out;
- }
- spin_unlock(&d->page_alloc_lock);
-#endif
-
- /* Init the shadow memory allocation if the user hasn't done so */
- old_pages = d->arch.shadow2.total_pages;
- if ( old_pages == 0 )
- if ( set_sh2_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
- {
- set_sh2_allocation(d, 0, NULL);
- rv = -ENOMEM;
- goto out;
- }
-
- /* Init the hash table */
- if ( shadow2_hash_alloc(d) != 0 )
- {
- set_sh2_allocation(d, old_pages, NULL);
- rv = -ENOMEM;
- goto out;
- }
-
- /* Init the P2M table */
- if ( mode & SHM2_translate )
- if ( !shadow2_alloc_p2m_table(d) )
- {
- shadow2_hash_teardown(d);
- set_sh2_allocation(d, old_pages, NULL);
- shadow2_p2m_teardown(d);
- rv = -ENOMEM;
- goto out;
- }
-
- /* Update the bits */
- sh2_new_mode(d, mode);
- shadow2_audit_p2m(d);
- out:
- shadow2_unlock(d);
- domain_unpause(d);
- return 0;
-}
-
-void shadow2_teardown(struct domain *d)
-/* Destroy the shadow pagetables of this domain and free its shadow memory.
- * Should only be called for dying domains. */
-{
- struct vcpu *v;
- mfn_t mfn;
-
- ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
- ASSERT(d != current->domain);
-
- if ( !shadow2_lock_is_acquired(d) )
- shadow2_lock(d); /* Keep various asserts happy */
-
- if ( shadow2_mode_enabled(d) )
- {
- /* Release the shadow and monitor tables held by each vcpu */
- for_each_vcpu(d, v)
- {
- shadow2_detach_old_tables(v);
- if ( shadow2_mode_external(d) )
- {
- mfn = pagetable_get_mfn(v->arch.monitor_table);
- if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
- shadow2_destroy_monitor_table(v, mfn);
- v->arch.monitor_table = pagetable_null();
- }
- }
- }
-
- if ( d->arch.shadow2.total_pages != 0 )
- {
- SHADOW2_PRINTK("teardown of domain %u starts."
- " Shadow pages total = %u, free = %u, p2m=%u\n",
- d->domain_id,
- d->arch.shadow2.total_pages,
- d->arch.shadow2.free_pages,
- d->arch.shadow2.p2m_pages);
- /* Destroy all the shadows and release memory to domheap */
- set_sh2_allocation(d, 0, NULL);
- /* Release the hash table back to xenheap */
- if (d->arch.shadow2.hash_table)
- shadow2_hash_teardown(d);
- /* Release the log-dirty bitmap of dirtied pages */
- sh2_free_log_dirty_bitmap(d);
- /* Should not have any more memory held */
- SHADOW2_PRINTK("teardown done."
- " Shadow pages total = %u, free = %u, p2m=%u\n",
- d->arch.shadow2.total_pages,
- d->arch.shadow2.free_pages,
- d->arch.shadow2.p2m_pages);
- ASSERT(d->arch.shadow2.total_pages == 0);
- }
-
- /* We leave the "permanent" shadow modes enabled, but clear the
- * log-dirty mode bit. We don't want any more mark_dirty()
- * calls now that we've torn down the bitmap */
- d->arch.shadow2.mode &= ~SHM2_log_dirty;
-
- shadow2_unlock(d);
-}
-
-void shadow2_final_teardown(struct domain *d)
-/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
-{
-
- SHADOW2_PRINTK("dom %u final teardown starts."
- " Shadow pages total = %u, free = %u, p2m=%u\n",
- d->domain_id,
- d->arch.shadow2.total_pages,
- d->arch.shadow2.free_pages,
- d->arch.shadow2.p2m_pages);
-
- /* Double-check that the domain didn't have any shadow memory.
- * It is possible for a domain that never got domain_kill()ed
- * to get here with its shadow allocation intact. */
- if ( d->arch.shadow2.total_pages != 0 )
- shadow2_teardown(d);
-
- /* It is now safe to pull down the p2m map. */
- if ( d->arch.shadow2.p2m_pages != 0 )
- shadow2_p2m_teardown(d);
-
- SHADOW2_PRINTK("dom %u final teardown done."
- " Shadow pages total = %u, free = %u, p2m=%u\n",
- d->domain_id,
- d->arch.shadow2.total_pages,
- d->arch.shadow2.free_pages,
- d->arch.shadow2.p2m_pages);
-}
-
-static int shadow2_one_bit_enable(struct domain *d, u32 mode)
-/* Turn on a single shadow mode feature */
-{
- ASSERT(shadow2_lock_is_acquired(d));
-
- /* Sanity check the call */
- if ( d == current->domain || (d->arch.shadow2.mode & mode) )
- {
- return -EINVAL;
- }
-
- if ( d->arch.shadow2.mode == 0 )
- {
- /* Init the shadow memory allocation and the hash table */
- if ( set_sh2_allocation(d, 1, NULL) != 0
- || shadow2_hash_alloc(d) != 0 )
- {
- set_sh2_allocation(d, 0, NULL);
- return -ENOMEM;
- }
- }
-
- /* Update the bits */
- sh2_new_mode(d, d->arch.shadow2.mode | mode);
-
- return 0;
-}
-
-static int shadow2_one_bit_disable(struct domain *d, u32 mode)
-/* Turn off a single shadow mode feature */
-{
- struct vcpu *v;
- ASSERT(shadow2_lock_is_acquired(d));
-
- /* Sanity check the call */
- if ( d == current->domain || !(d->arch.shadow2.mode & mode) )
- {
- return -EINVAL;
- }
-
- /* Update the bits */
- sh2_new_mode(d, d->arch.shadow2.mode & ~mode);
- if ( d->arch.shadow2.mode == 0 )
- {
- /* Get this domain off shadows */
- SHADOW2_PRINTK("un-shadowing of domain %u starts."
- " Shadow pages total = %u, free = %u, p2m=%u\n",
- d->domain_id,
- d->arch.shadow2.total_pages,
- d->arch.shadow2.free_pages,
- d->arch.shadow2.p2m_pages);
- for_each_vcpu(d, v)
- {
- shadow2_detach_old_tables(v);
-#if CONFIG_PAGING_LEVELS == 4
- if ( !(v->arch.flags & TF_kernel_mode) )
- make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
- else
-#endif
- make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
-
- }
-
- /* Pull down the memory allocation */
- if ( set_sh2_allocation(d, 0, NULL) != 0 )
- {
- // XXX - How can this occur?
- // Seems like a bug to return an error now that we've
- // disabled the relevant shadow mode.
- //
- return -ENOMEM;
- }
- shadow2_hash_teardown(d);
- SHADOW2_PRINTK("un-shadowing of domain %u done."
- " Shadow pages total = %u, free = %u, p2m=%u\n",
- d->domain_id,
- d->arch.shadow2.total_pages,
- d->arch.shadow2.free_pages,
- d->arch.shadow2.p2m_pages);
- }
-
- return 0;
-}
-
-/* Enable/disable ops for the "test" and "log-dirty" modes */
-int shadow2_test_enable(struct domain *d)
-{
- int ret;
-
- domain_pause(d);
- shadow2_lock(d);
-
- if ( shadow2_mode_enabled(d) )
- {
- SHADOW2_ERROR("Don't support enabling test mode"
- "on already shadowed doms\n");
- ret = -EINVAL;
- goto out;
- }
-
- ret = shadow2_one_bit_enable(d, SHM2_enable);
- out:
- shadow2_unlock(d);
- domain_unpause(d);
-
- return ret;
-}
-
-int shadow2_test_disable(struct domain *d)
-{
- int ret;
-
- domain_pause(d);
- shadow2_lock(d);
- ret = shadow2_one_bit_disable(d, SHM2_enable);
- shadow2_unlock(d);
- domain_unpause(d);
-
- return ret;
-}
-
-static int
-sh2_alloc_log_dirty_bitmap(struct domain *d)
-{
- ASSERT(d->arch.shadow2.dirty_bitmap == NULL);
- d->arch.shadow2.dirty_bitmap_size =
- (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
- ~(BITS_PER_LONG - 1);
- d->arch.shadow2.dirty_bitmap =
- xmalloc_array(unsigned long,
- d->arch.shadow2.dirty_bitmap_size / BITS_PER_LONG);
- if ( d->arch.shadow2.dirty_bitmap == NULL )
- {
- d->arch.shadow2.dirty_bitmap_size = 0;
- return -ENOMEM;
- }
- memset(d->arch.shadow2.dirty_bitmap, 0, d->arch.shadow2.dirty_bitmap_size/8);
-
- return 0;
-}
-
-static void
-sh2_free_log_dirty_bitmap(struct domain *d)
-{
- d->arch.shadow2.dirty_bitmap_size = 0;
- if ( d->arch.shadow2.dirty_bitmap )
- {
- xfree(d->arch.shadow2.dirty_bitmap);
- d->arch.shadow2.dirty_bitmap = NULL;
- }
-}
-
-static int shadow2_log_dirty_enable(struct domain *d)
-{
- int ret;
-
- domain_pause(d);
- shadow2_lock(d);
-
- if ( shadow2_mode_log_dirty(d) )
- {
- ret = -EINVAL;
- goto out;
- }
-
- if ( shadow2_mode_enabled(d) )
- {
- SHADOW2_ERROR("Don't (yet) support enabling log-dirty"
- "on already shadowed doms\n");
- ret = -EINVAL;
- goto out;
- }
-
- ret = sh2_alloc_log_dirty_bitmap(d);
- if ( ret != 0 )
- {
- sh2_free_log_dirty_bitmap(d);
- goto out;
- }
-
- ret = shadow2_one_bit_enable(d, SHM2_log_dirty);
- if ( ret != 0 )
- sh2_free_log_dirty_bitmap(d);
-
- out:
- shadow2_unlock(d);
- domain_unpause(d);
- return ret;
-}
-
-static int shadow2_log_dirty_disable(struct domain *d)
-{
- int ret;
-
- domain_pause(d);
- shadow2_lock(d);
- ret = shadow2_one_bit_disable(d, SHM2_log_dirty);
- if ( !shadow2_mode_log_dirty(d) )
- sh2_free_log_dirty_bitmap(d);
- shadow2_unlock(d);
- domain_unpause(d);
-
- return ret;
-}
-
-/**************************************************************************/
-/* P2M map manipulations */
-
-static void
-sh2_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
-{
- struct vcpu *v;
-
- if ( !shadow2_mode_translate(d) )
- return;
-
- v = current;
- if ( v->domain != d )
- v = d->vcpu[0];
-
-
- SHADOW2_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
-
- ASSERT(mfn_x(sh2_gfn_to_mfn(d, gfn)) == mfn);
- //ASSERT(sh2_mfn_to_gfn(d, mfn) == gfn);
-
- shadow2_remove_all_shadows_and_parents(v, _mfn(mfn));
- if ( shadow2_remove_all_mappings(v, _mfn(mfn)) )
- flush_tlb_mask(d->domain_dirty_cpumask);
- shadow2_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
- set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
-}
-
-void
-shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
- unsigned long mfn)
-{
- shadow2_lock(d);
- shadow2_audit_p2m(d);
- sh2_p2m_remove_page(d, gfn, mfn);
- shadow2_audit_p2m(d);
- shadow2_unlock(d);
-}
-
-void
-shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn,
- unsigned long mfn)
-{
- struct vcpu *v;
- unsigned long ogfn;
- mfn_t omfn;
-
- if ( !shadow2_mode_translate(d) )
- return;
-
- v = current;
- if ( v->domain != d )
- v = d->vcpu[0];
-
- shadow2_lock(d);
- shadow2_audit_p2m(d);
-
- SHADOW2_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
-
- omfn = sh2_gfn_to_mfn(d, gfn);
- if ( valid_mfn(omfn) )
- {
- /* Get rid of the old mapping, especially any shadows */
- shadow2_remove_all_shadows_and_parents(v, omfn);
- if ( shadow2_remove_all_mappings(v, omfn) )
- flush_tlb_mask(d->domain_dirty_cpumask);
- set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
- }
-
- ogfn = sh2_mfn_to_gfn(d, _mfn(mfn));
- if (
-#ifdef __x86_64__
- (ogfn != 0x5555555555555555L)
-#else
- (ogfn != 0x55555555L)
-#endif
- && (ogfn != INVALID_M2P_ENTRY)
- && (ogfn != gfn) )
- {
- /* This machine frame is already mapped at another physical address */
- SHADOW2_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
- mfn, ogfn, gfn);
- if ( valid_mfn(omfn = sh2_gfn_to_mfn(d, ogfn)) )
- {
- SHADOW2_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
- ogfn , mfn_x(omfn));
- if ( mfn_x(omfn) == mfn )
- sh2_p2m_remove_page(d, ogfn, mfn);
- }
- }
-
- shadow2_set_p2m_entry(d, gfn, _mfn(mfn));
- set_gpfn_from_mfn(mfn, gfn);
- shadow2_audit_p2m(d);
- shadow2_unlock(d);
-}
-
-/**************************************************************************/
-/* Log-dirty mode support */
-
-/* Convert a shadow to log-dirty mode. */
-void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
-{
- BUG();
-}
-
-
-/* Read a domain's log-dirty bitmap and stats.
- * If the operation is a CLEAN, clear the bitmap and stats as well. */
-static int shadow2_log_dirty_op(
- struct domain *d, struct xen_domctl_shadow_op *sc)
-{
- int i, rv = 0, clean = 0;
-
- domain_pause(d);
- shadow2_lock(d);
-
- clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
-
- SHADOW2_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
- (clean) ? "clean" : "peek",
- d->domain_id,
- d->arch.shadow2.fault_count,
- d->arch.shadow2.dirty_count);
-
- sc->stats.fault_count = d->arch.shadow2.fault_count;
- sc->stats.dirty_count = d->arch.shadow2.dirty_count;
-
- if ( clean )
- {
- struct list_head *l, *t;
- struct page_info *pg;
-
- /* Need to revoke write access to the domain's pages again.
- * In future, we'll have a less heavy-handed approach to this,
- * but for now, we just unshadow everything except Xen. */
- list_for_each_safe(l, t, &d->arch.shadow2.toplevel_shadows)
- {
- pg = list_entry(l, struct page_info, list);
- shadow2_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
- }
-
- d->arch.shadow2.fault_count = 0;
- d->arch.shadow2.dirty_count = 0;
- }
-
- if ( guest_handle_is_null(sc->dirty_bitmap) ||
- (d->arch.shadow2.dirty_bitmap == NULL) )
- {
- rv = -EINVAL;
- goto out;
- }
-
- if ( sc->pages > d->arch.shadow2.dirty_bitmap_size )
- sc->pages = d->arch.shadow2.dirty_bitmap_size;
-
-#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
- for ( i = 0; i < sc->pages; i += CHUNK )
- {
- int bytes = ((((sc->pages - i) > CHUNK)
- ? CHUNK
- : (sc->pages - i)) + 7) / 8;
-
- if ( copy_to_guest_offset(
- sc->dirty_bitmap,
- i/(8*sizeof(unsigned long)),
- d->arch.shadow2.dirty_bitmap + (i/(8*sizeof(unsigned long))),
- (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
- {
- rv = -EINVAL;
- goto out;
- }
-
- if ( clean )
- memset(d->arch.shadow2.dirty_bitmap + (i/(8*sizeof(unsigned long))),
- 0, bytes);
- }
-#undef CHUNK
-
- out:
- shadow2_unlock(d);
- domain_unpause(d);
- return 0;
-}
-
-
-/* Mark a page as dirty */
-void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn)
-{
- unsigned long pfn;
-
- ASSERT(shadow2_lock_is_acquired(d));
- ASSERT(shadow2_mode_log_dirty(d));
-
- if ( !valid_mfn(gmfn) )
- return;
-
- ASSERT(d->arch.shadow2.dirty_bitmap != NULL);
-
- /* We /really/ mean PFN here, even for non-translated guests. */
- pfn = get_gpfn_from_mfn(mfn_x(gmfn));
-
- /*
- * Values with the MSB set denote MFNs that aren't really part of the
- * domain's pseudo-physical memory map (e.g., the shared info frame).
- * Nothing to do here...
- */
- if ( unlikely(!VALID_M2P(pfn)) )
- return;
-
- /* N.B. Can use non-atomic TAS because protected by shadow2_lock. */
- if ( likely(pfn < d->arch.shadow2.dirty_bitmap_size) )
- {
- if ( !__test_and_set_bit(pfn, d->arch.shadow2.dirty_bitmap) )
- {
- SHADOW2_DEBUG(LOGDIRTY,
- "marked mfn %" SH2_PRI_mfn " (pfn=%lx), dom %d\n",
- mfn_x(gmfn), pfn, d->domain_id);
- d->arch.shadow2.dirty_count++;
- }
- }
- else
- {
- SHADOW2_PRINTK("mark_dirty OOR! "
- "mfn=%" SH2_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
- "owner=%d c=%08x t=%" PRtype_info "\n",
- mfn_x(gmfn),
- pfn,
- d->arch.shadow2.dirty_bitmap_size,
- d->domain_id,
- (page_get_owner(mfn_to_page(gmfn))
- ? page_get_owner(mfn_to_page(gmfn))->domain_id
- : -1),
- mfn_to_page(gmfn)->count_info,
- mfn_to_page(gmfn)->u.inuse.type_info);
- }
-}
-
-
-/**************************************************************************/
-/* Shadow-control XEN_DOMCTL dispatcher */
-
-int shadow2_domctl(struct domain *d,
- xen_domctl_shadow_op_t *sc,
- XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
-{
- int rc, preempted = 0;
-
- if ( unlikely(d == current->domain) )
- {
- DPRINTK("Don't try to do a shadow op on yourself!\n");
- return -EINVAL;
- }
-
- switch ( sc->op )
- {
- case XEN_DOMCTL_SHADOW_OP_OFF:
- if ( shadow2_mode_log_dirty(d) )
- if ( (rc = shadow2_log_dirty_disable(d)) != 0 )
- return rc;
- if ( d->arch.shadow2.mode & SHM2_enable )
- if ( (rc = shadow2_test_disable(d)) != 0 )
- return rc;
- return 0;
-
- case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
- return shadow2_test_enable(d);
-
- case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
- return shadow2_log_dirty_enable(d);
-
- case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
- return shadow2_enable(d, SHM2_refcounts|SHM2_translate);
-
- case XEN_DOMCTL_SHADOW_OP_CLEAN:
- case XEN_DOMCTL_SHADOW_OP_PEEK:
- return shadow2_log_dirty_op(d, sc);
-
- case XEN_DOMCTL_SHADOW_OP_ENABLE:
- if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
- return shadow2_log_dirty_enable(d);
- return shadow2_enable(d, sc->mode << SHM2_shift);
-
- case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
- sc->mb = shadow2_get_allocation(d);
- return 0;
-
- case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
- rc = shadow2_set_allocation(d, sc->mb, &preempted);
- if ( preempted )
- /* Not finished. Set up to re-run the call. */
- rc = hypercall_create_continuation(
- __HYPERVISOR_domctl, "h", u_domctl);
- else
- /* Finished. Return the new allocation */
- sc->mb = shadow2_get_allocation(d);
- return rc;
-
- default:
- SHADOW2_ERROR("Bad shadow op %u\n", sc->op);
- return -EINVAL;
- }
-}
-
-
-/**************************************************************************/
-/* Auditing shadow tables */
-
-#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL
-
-void shadow2_audit_tables(struct vcpu *v)
-{
- /* Dispatch table for getting per-type functions */
- static hash_callback_t callbacks[16] = {
- NULL, /* none */
-#if CONFIG_PAGING_LEVELS == 2
- SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,2,2), /* l1_32 */
- SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,2,2), /* fl1_32 */
- SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,2,2), /* l2_32 */
-#else
- SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,2), /* l1_32 */
- SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,2), /* fl1_32 */
- SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,2), /* l2_32 */
- SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,3), /* l1_pae */
- SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,3), /* fl1_pae */
- SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2_pae */
- SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2h_pae */
- SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,3,3), /* l3_pae */
-#if CONFIG_PAGING_LEVELS >= 4
- SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,4,4), /* l1_64 */
- SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,4,4), /* fl1_64 */
- SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,4,4), /* l2_64 */
- SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,4,4), /* l3_64 */
- SHADOW2_INTERNAL_NAME(sh2_audit_l4_table,4,4), /* l4_64 */
-#endif /* CONFIG_PAGING_LEVELS >= 4 */
-#endif /* CONFIG_PAGING_LEVELS > 2 */
- NULL /* All the rest */
- };
- unsigned int mask;
-
- if ( !(SHADOW2_AUDIT_ENABLE) )
- return;
-
- if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL )
- mask = ~1; /* Audit every table in the system */
- else
- {
- /* Audit only the current mode's tables */
- switch ( v->arch.shadow2.mode->guest_levels )
- {
- case 2: mask = (SH2F_L1_32|SH2F_FL1_32|SH2F_L2_32); break;
- case 3: mask = (SH2F_L1_PAE|SH2F_FL1_PAE|SH2F_L2_PAE
- |SH2F_L2H_PAE|SH2F_L3_PAE); break;
- case 4: mask = (SH2F_L1_64|SH2F_FL1_64|SH2F_L2_64
- |SH2F_L3_64|SH2F_L4_64); break;
- default: BUG();
- }
- }
-
- hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
-}
-
-#endif /* Shadow audit */
-
-
-/**************************************************************************/
-/* Auditing p2m tables */
-
-#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M
-
-void shadow2_audit_p2m(struct domain *d)
-{
- struct list_head *entry;
- struct page_info *page;
- struct domain *od;
- unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
- mfn_t p2mfn;
- unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
- int test_linear;
-
- if ( !(SHADOW2_AUDIT_ENABLE) || !shadow2_mode_translate(d) )
- return;
-
- //SHADOW2_PRINTK("p2m audit starts\n");
-
- test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
- if ( test_linear )
- local_flush_tlb();
-
- /* Audit part one: walk the domain's page allocation list, checking
- * the m2p entries. */
- for ( entry = d->page_list.next;
- entry != &d->page_list;
- entry = entry->next )
- {
- page = list_entry(entry, struct page_info, list);
- mfn = mfn_x(page_to_mfn(page));
-
- // SHADOW2_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
-
- od = page_get_owner(page);
-
- if ( od != d )
- {
- SHADOW2_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
- mfn, od, (od?od->domain_id:-1), d, d->domain_id);
- continue;
- }
-
- gfn = get_gpfn_from_mfn(mfn);
- if ( gfn == INVALID_M2P_ENTRY )
- {
- orphans_i++;
- //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
- // mfn);
- continue;
- }
-
- if ( gfn == 0x55555555 )
- {
- orphans_d++;
- //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
- // mfn);
- continue;
- }
-
- p2mfn = sh2_gfn_to_mfn_foreign(d, gfn);
- if ( mfn_x(p2mfn) != mfn )
- {
- mpbad++;
- SHADOW2_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
- " (-> gfn %#lx)\n",
- mfn, gfn, mfn_x(p2mfn),
- (mfn_valid(p2mfn)
- ? get_gpfn_from_mfn(mfn_x(p2mfn))
- : -1u));
- /* This m2p entry is stale: the domain has another frame in
- * this physical slot. No great disaster, but for neatness,
- * blow away the m2p entry. */
- set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
- }
-
- if ( test_linear )
- {
- lp2mfn = get_mfn_from_gpfn(gfn);
- if ( lp2mfn != mfn_x(p2mfn) )
- {
- SHADOW2_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
- "(!= mfn %#lx)\n", gfn, lp2mfn, p2mfn);
- }
- }
-
- // SHADOW2_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
- // mfn, gfn, p2mfn, lp2mfn);
- }
-
- /* Audit part two: walk the domain's p2m table, checking the entries. */
- if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
- {
- l2_pgentry_t *l2e;
- l1_pgentry_t *l1e;
- int i1, i2;
-
-#if CONFIG_PAGING_LEVELS == 4
- l4_pgentry_t *l4e;
- l3_pgentry_t *l3e;
- int i3, i4;
- l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
-#elif CONFIG_PAGING_LEVELS == 3
- l3_pgentry_t *l3e;
- int i3;
- l3e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
-#else /* CONFIG_PAGING_LEVELS == 2 */
- l2e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
-#endif
-
- gfn = 0;
-#if CONFIG_PAGING_LEVELS >= 3
-#if CONFIG_PAGING_LEVELS >= 4
- for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
- {
- if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
- {
- gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
- continue;
- }
- l3e = sh2_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
-#endif /* now at levels 3 or 4... */
- for ( i3 = 0;
- i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
- i3++ )
- {
- if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
- {
- gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
- continue;
- }
- l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
-#endif /* all levels... */
- for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
- {
- if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
- {
- gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
- continue;
- }
- l1e = sh2_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
-
- for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
- {
- if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
- continue;
- mfn = l1e_get_pfn(l1e[i1]);
- ASSERT(valid_mfn(_mfn(mfn)));
- m2pfn = get_gpfn_from_mfn(mfn);
- if ( m2pfn != gfn )
- {
- pmbad++;
- SHADOW2_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
- " -> gfn %#lx\n", gfn, mfn, m2pfn);
- BUG();
- }
- }
- sh2_unmap_domain_page(l1e);
- }
-#if CONFIG_PAGING_LEVELS >= 3
- sh2_unmap_domain_page(l2e);
- }
-#if CONFIG_PAGING_LEVELS >= 4
- sh2_unmap_domain_page(l3e);
- }
-#endif
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
- sh2_unmap_domain_page(l4e);
-#elif CONFIG_PAGING_LEVELS == 3
- sh2_unmap_domain_page(l3e);
-#else /* CONFIG_PAGING_LEVELS == 2 */
- sh2_unmap_domain_page(l2e);
-#endif
-
- }
-
- //SHADOW2_PRINTK("p2m audit complete\n");
- //if ( orphans_i | orphans_d | mpbad | pmbad )
- // SHADOW2_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
- // orphans_i + orphans_d, orphans_i, orphans_d,
- if ( mpbad | pmbad )
- SHADOW2_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
- pmbad, mpbad);
-}
-
-#endif /* p2m audit */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/******************************************************************************
- * arch/x86/shadow2.c
- *
- * Simple, mostly-synchronous shadow page tables.
- * Parts of this code are Copyright (c) 2006 by XenSource Inc.
- * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
- * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-// DESIGN QUESTIONS:
-// Why use subshadows for PAE guests?
-// - reduces pressure in the hash table
-// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
-// - would need to find space in the page_info to store 7 more bits of
-// backpointer
-// - independent shadows of 32 byte chunks makes it non-obvious how to quickly
-// figure out when to demote the guest page from l3 status
-//
-// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
-// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
-// space for both PV and HVM guests.
-//
-
-#define SHADOW2 1
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
-#include <xen/trace.h>
-#include <xen/sched.h>
-#include <xen/perfc.h>
-#include <xen/domain_page.h>
-#include <asm/page.h>
-#include <asm/current.h>
-#include <asm/shadow2.h>
-#include <asm/shadow2-private.h>
-#include <asm/shadow2-types.h>
-#include <asm/flushtlb.h>
-#include <asm/hvm/hvm.h>
-
-/* The first cut: an absolutely synchronous, trap-and-emulate version,
- * supporting only HVM guests (and so only "external" shadow mode).
- *
- * THINGS TO DO LATER:
- *
- * FIX GVA_TO_GPA
- * The current interface returns an unsigned long, which is not big enough
- * to hold a physical address in PAE. Should return a gfn instead.
- *
- * TEARDOWN HEURISTICS
- * Also: have a heuristic for when to destroy a previous paging-mode's
- * shadows. When a guest is done with its start-of-day 32-bit tables
- * and reuses the memory we want to drop those shadows. Start with
- * shadows in a page in two modes as a hint, but beware of clever tricks
- * like reusing a pagetable for both PAE and 64-bit during boot...
- *
- * PAE LINEAR MAPS
- * Rework shadow_get_l*e() to have the option of using map_domain_page()
- * instead of linear maps. Add appropriate unmap_l*e calls in the users.
- * Then we can test the speed difference made by linear maps. If the
- * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
- * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
- * to share l2h pages again.
- *
- * PAE L3 COPYING
- * In this code, we copy all 32 bytes of a PAE L3 every time we change an
- * entry in it, and every time we change CR3. We copy it for the linear
- * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
- * buffer so it fits in CR3. Maybe we can avoid some of this recopying
- * by using the shadow directly in some places.
- * Also, for SMP, need to actually respond to seeing shadow2.pae_flip_pending.
- *
- * GUEST_WALK_TABLES TLB FLUSH COALESCE
- * guest_walk_tables can do up to three remote TLB flushes as it walks to
- * the first l1 of a new pagetable. Should coalesce the flushes to the end,
- * and if we do flush, re-do the walk. If anything has changed, then
- * pause all the other vcpus and do the walk *again*.
- *
- * WP DISABLED
- * Consider how to implement having the WP bit of CR0 set to 0.
- * Since we need to be able to cause write faults to pagetables, this might
- * end up looking like not having the (guest) pagetables present at all in
- * HVM guests...
- *
- * PSE disabled / PSE36
- * We don't support any modes other than PSE enabled, PSE36 disabled.
- * Neither of those would be hard to change, but we'd need to be able to
- * deal with shadows made in one mode and used in another.
- */
-
-#define FETCH_TYPE_PREFETCH 1
-#define FETCH_TYPE_DEMAND 2
-#define FETCH_TYPE_WRITE 4
-typedef enum {
- ft_prefetch = FETCH_TYPE_PREFETCH,
- ft_demand_read = FETCH_TYPE_DEMAND,
- ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
-} fetch_type_t;
-
-#ifdef DEBUG_TRACE_DUMP
-static char *fetch_type_names[] = {
- [ft_prefetch] "prefetch",
- [ft_demand_read] "demand read",
- [ft_demand_write] "demand write",
-};
-#endif
-
-/* XXX forward declarations */
-#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
-static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res);
-#endif
-static inline void sh2_update_linear_entries(struct vcpu *v);
-
-/**************************************************************************/
-/* Hash table mapping from guest pagetables to shadows
- *
- * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
- * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
- * shadow L1 which maps its "splinters".
- * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
- * PAE L3 info page for that CR3 value.
- */
-
-static inline mfn_t
-get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
-/* Look for FL1 shadows in the hash table */
-{
- mfn_t smfn = shadow2_hash_lookup(v, gfn_x(gfn),
- PGC_SH2_fl1_shadow >> PGC_SH2_type_shift);
-
- if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
- {
- struct page_info *page = mfn_to_page(smfn);
- if ( !(page->count_info & PGC_SH2_log_dirty) )
- shadow2_convert_to_log_dirty(v, smfn);
- }
-
- return smfn;
-}
-
-static inline mfn_t
-get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
-/* Look for shadows in the hash table */
-{
- mfn_t smfn = shadow2_hash_lookup(v, mfn_x(gmfn),
- shadow_type >> PGC_SH2_type_shift);
- perfc_incrc(shadow2_get_shadow_status);
-
- if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
- {
- struct page_info *page = mfn_to_page(smfn);
- if ( !(page->count_info & PGC_SH2_log_dirty) )
- shadow2_convert_to_log_dirty(v, smfn);
- }
-
- return smfn;
-}
-
-static inline void
-set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
-/* Put an FL1 shadow into the hash table */
-{
- SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n",
- gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn));
-
- if ( unlikely(shadow2_mode_log_dirty(v->domain)) )
- // mark this shadow as a log dirty shadow...
- set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
- else
- clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
-
- shadow2_hash_insert(v, gfn_x(gfn),
- PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn);
-}
-
-static inline void
-set_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
-/* Put a shadow into the hash table */
-{
- struct domain *d = v->domain;
- int res;
-
- SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
- d->domain_id, v->vcpu_id, mfn_x(gmfn),
- shadow_type, mfn_x(smfn));
-
- if ( unlikely(shadow2_mode_log_dirty(d)) )
- // mark this shadow as a log dirty shadow...
- set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
- else
- clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
-
- res = get_page(mfn_to_page(gmfn), d);
- ASSERT(res == 1);
-
- shadow2_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH2_type_shift,
- smfn);
-}
-
-static inline void
-delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
-/* Remove a shadow from the hash table */
-{
- SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n",
- gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn));
-
- shadow2_hash_delete(v, gfn_x(gfn),
- PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn);
-}
-
-static inline void
-delete_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
-/* Remove a shadow from the hash table */
-{
- SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
- v->domain->domain_id, v->vcpu_id,
- mfn_x(gmfn), shadow_type, mfn_x(smfn));
- shadow2_hash_delete(v, mfn_x(gmfn),
- shadow_type >> PGC_SH2_type_shift, smfn);
- put_page(mfn_to_page(gmfn));
-}
-
-/**************************************************************************/
-/* CPU feature support querying */
-
-static inline int
-guest_supports_superpages(struct vcpu *v)
-{
- /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
- * CR4.PSE is set or the guest is in PAE or long mode */
- return (hvm_guest(v) && (GUEST_PAGING_LEVELS != 2
- || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
-}
-
-static inline int
-guest_supports_nx(struct vcpu *v)
-{
- if ( !hvm_guest(v) )
- return cpu_has_nx;
-
- // XXX - fix this!
- return 1;
-}
-
-
-/**************************************************************************/
-/* Functions for walking the guest page tables */
-
-
-/* Walk the guest pagetables, filling the walk_t with what we see.
- * Takes an uninitialised walk_t. The caller must call unmap_walk()
- * on the walk_t before discarding it or calling guest_walk_tables again.
- * If "guest_op" is non-zero, we are serving a genuine guest memory access,
- * and must (a) be under the shadow2 lock, and (b) remove write access
- * from any gueat PT pages we see, as we will be using their contents to
- * perform shadow updates.
- * Returns 0 for success or non-zero if the guest pagetables are malformed.
- * N.B. Finding a not-present entry does not cause a non-zero return code. */
-static inline int
-guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
-{
- ASSERT(!guest_op || shadow2_lock_is_acquired(v->domain));
-
- perfc_incrc(shadow2_guest_walk);
- memset(gw, 0, sizeof(*gw));
- gw->va = va;
-
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
- /* Get l4e from the top level table */
- gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
- gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
- /* Walk down to the l3e */
- if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
- gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
- if ( !valid_mfn(gw->l3mfn) ) return 1;
- /* This mfn is a pagetable: make sure the guest can't write to it. */
- if ( guest_op && shadow2_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
- flush_tlb_mask(v->domain->domain_dirty_cpumask);
- gw->l3e = ((guest_l3e_t *)sh2_map_domain_page(gw->l3mfn))
- + guest_l3_table_offset(va);
-#else /* PAE only... */
- /* Get l3e from the top level table */
- gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
- gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
-#endif /* PAE or 64... */
- /* Walk down to the l2e */
- if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
- gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
- if ( !valid_mfn(gw->l2mfn) ) return 1;
- /* This mfn is a pagetable: make sure the guest can't write to it. */
- if ( guest_op && shadow2_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
- flush_tlb_mask(v->domain->domain_dirty_cpumask);
- gw->l2e = ((guest_l2e_t *)sh2_map_domain_page(gw->l2mfn))
- + guest_l2_table_offset(va);
-#else /* 32-bit only... */
- /* Get l2e from the top level table */
- gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
- gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
-#endif /* All levels... */
-
- if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
- if ( guest_supports_superpages(v) &&
- (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
- {
- /* Special case: this guest VA is in a PSE superpage, so there's
- * no guest l1e. We make one up so that the propagation code
- * can generate a shadow l1 table. Start with the gfn of the
- * first 4k-page of the superpage. */
- gfn_t start = guest_l2e_get_gfn(*gw->l2e);
- /* Grant full access in the l1e, since all the guest entry's
- * access controls are enforced in the shadow l2e. This lets
- * us reflect l2 changes later without touching the l1s. */
- int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
- _PAGE_ACCESSED|_PAGE_DIRTY);
- /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
- * of the level 1 */
- if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
- flags |= _PAGE_PAT;
- /* Increment the pfn by the right number of 4k pages.
- * The ~0x1 is to mask out the PAT bit mentioned above. */
- start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
- gw->eff_l1e = guest_l1e_from_gfn(start, flags);
- gw->l1e = NULL;
- gw->l1mfn = _mfn(INVALID_MFN);
- }
- else
- {
- /* Not a superpage: carry on and find the l1e. */
- gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
- if ( !valid_mfn(gw->l1mfn) ) return 1;
- /* This mfn is a pagetable: make sure the guest can't write to it. */
- if ( guest_op
- && shadow2_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
- flush_tlb_mask(v->domain->domain_dirty_cpumask);
- gw->l1e = ((guest_l1e_t *)sh2_map_domain_page(gw->l1mfn))
- + guest_l1_table_offset(va);
- gw->eff_l1e = *gw->l1e;
- }
-
- return 0;
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding frame number. */
-static inline gfn_t
-guest_walk_to_gfn(walk_t *gw)
-{
- if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
- return _gfn(INVALID_GFN);
- return guest_l1e_get_gfn(gw->eff_l1e);
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding physical address. */
-static inline paddr_t
-guest_walk_to_gpa(walk_t *gw)
-{
- if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
- return 0;
- return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
-}
-
-
-/* Unmap (and reinitialise) a guest walk.
- * Call this to dispose of any walk filled in by guest_walk_tables() */
-static void unmap_walk(struct vcpu *v, walk_t *gw)
-{
-#if GUEST_PAGING_LEVELS >= 3
-#if GUEST_PAGING_LEVELS >= 4
- if ( gw->l3e != NULL ) sh2_unmap_domain_page(gw->l3e);
-#endif
- if ( gw->l2e != NULL ) sh2_unmap_domain_page(gw->l2e);
-#endif
- if ( gw->l1e != NULL ) sh2_unmap_domain_page(gw->l1e);
-#ifdef DEBUG
- memset(gw, 0, sizeof(*gw));
-#endif
-}
-
-
-/* Pretty-print the contents of a guest-walk */
-static inline void print_gw(walk_t *gw)
-{
- SHADOW2_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
- SHADOW2_PRINTK(" l4mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l4mfn));
- SHADOW2_PRINTK(" l4e=%p\n", gw->l4e);
- if ( gw->l4e )
- SHADOW2_PRINTK(" *l4e=%" SH2_PRI_gpte "\n", gw->l4e->l4);
-#endif /* PAE or 64... */
- SHADOW2_PRINTK(" l3mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l3mfn));
- SHADOW2_PRINTK(" l3e=%p\n", gw->l3e);
- if ( gw->l3e )
- SHADOW2_PRINTK(" *l3e=%" SH2_PRI_gpte "\n", gw->l3e->l3);
-#endif /* All levels... */
- SHADOW2_PRINTK(" l2mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l2mfn));
- SHADOW2_PRINTK(" l2e=%p\n", gw->l2e);
- if ( gw->l2e )
- SHADOW2_PRINTK(" *l2e=%" SH2_PRI_gpte "\n", gw->l2e->l2);
- SHADOW2_PRINTK(" l1mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l1mfn));
- SHADOW2_PRINTK(" l1e=%p\n", gw->l1e);
- if ( gw->l1e )
- SHADOW2_PRINTK(" *l1e=%" SH2_PRI_gpte "\n", gw->l1e->l1);
- SHADOW2_PRINTK(" eff_l1e=%" SH2_PRI_gpte "\n", gw->eff_l1e.l1);
-}
-
-
-#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
-/* Lightweight audit: pass all the shadows associated with this guest walk
- * through the audit mechanisms */
-static void sh2_audit_gw(struct vcpu *v, walk_t *gw)
-{
- mfn_t smfn;
-
- if ( !(SHADOW2_AUDIT_ENABLE) )
- return;
-
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
- if ( valid_mfn(gw->l4mfn)
- && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn,
- PGC_SH2_l4_shadow))) )
- (void) sh2_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
-#endif /* PAE or 64... */
- if ( valid_mfn(gw->l3mfn)
- && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn,
- PGC_SH2_l3_shadow))) )
- (void) sh2_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
-#endif /* All levels... */
- if ( valid_mfn(gw->l2mfn) )
- {
- if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
- PGC_SH2_l2_shadow))) )
- (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
-#if GUEST_PAGING_LEVELS == 3
- if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
- PGC_SH2_l2h_shadow))) )
- (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
-#endif
- }
- if ( valid_mfn(gw->l1mfn)
- && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn,
- PGC_SH2_l1_shadow))) )
- (void) sh2_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
- else if ( gw->l2e
- && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
- && valid_mfn(
- (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
- (void) sh2_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
-}
-
-#else
-#define sh2_audit_gw(_v, _gw) do {} while(0)
-#endif /* audit code */
-
-
-
-/**************************************************************************/
-/* Function to write to the guest tables, for propagating accessed and
- * dirty bits from the shadow to the guest.
- * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
- * and an operation type. The guest entry is always passed as an l1e:
- * since we only ever write flags, that's OK.
- * Returns the new flag bits of the guest entry. */
-
-static u32 guest_set_ad_bits(struct vcpu *v,
- mfn_t gmfn,
- guest_l1e_t *ep,
- unsigned int level,
- fetch_type_t ft)
-{
- u32 flags, shflags, bit;
- struct page_info *pg;
- int res = 0;
-
- ASSERT(valid_mfn(gmfn)
- && (sh2_mfn_is_a_page_table(gmfn)
- || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
- == 0)));
- ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
- ASSERT(level <= GUEST_PAGING_LEVELS);
- ASSERT(ft == ft_demand_read || ft == ft_demand_write);
- ASSERT(shadow2_lock_is_acquired(v->domain));
-
- flags = guest_l1e_get_flags(*ep);
-
- /* PAE l3s do not have A and D bits */
- if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) )
- return flags;
-
- /* Need the D bit as well for writes, in l1es and 32bit/PAE PSE l2es. */
- if ( ft == ft_demand_write
- && (level == 1 ||
- (level == 2 && GUEST_PAGING_LEVELS < 4
- && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
- {
- if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
- == (_PAGE_DIRTY | _PAGE_ACCESSED) )
- return flags; /* Guest already has A and D bits set */
- flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
- perfc_incrc(shadow2_ad_update);
- }
- else
- {
- if ( flags & _PAGE_ACCESSED )
- return flags; /* Guest already has A bit set */
- flags |= _PAGE_ACCESSED;
- perfc_incrc(shadow2_a_update);
- }
-
- /* Set the bit(s) */
- sh2_mark_dirty(v->domain, gmfn);
- SHADOW2_DEBUG(A_AND_D, "gfn = %"SH2_PRI_gfn", "
- "old flags = %#x, new flags = %#x\n",
- guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags);
- *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
-
- /* May need to propagate this change forward to other kinds of shadow */
- pg = mfn_to_page(gmfn);
- if ( !sh2_mfn_is_a_page_table(gmfn) )
- {
- /* This guest pagetable is not yet shadowed at all. */
- // MAF: I think this assert is busted... If this gmfn has not yet
- // been promoted, then it seems perfectly reasonable for there to be
- // outstanding type refs to it...
- /* TJD: No. If the gmfn has not been promoted, we must at least
- * have recognised that it is a pagetable, and pulled write access.
- * The type count should only be non-zero if it is actually a page
- * table. The test above was incorrect, though, so I've fixed it. */
- ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0);
- return flags;
- }
-
- shflags = pg->shadow2_flags & SH2F_page_type_mask;
- while ( shflags )
- {
- bit = find_first_set_bit(shflags);
- ASSERT(shflags & (1u << bit));
- shflags &= ~(1u << bit);
- if ( !(pg->shadow2_flags & (1u << bit)) )
- continue;
- switch ( bit )
- {
- case PGC_SH2_type_to_index(PGC_SH2_l1_shadow):
- if (level != 1)
- res |= sh2_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep));
- break;
- case PGC_SH2_type_to_index(PGC_SH2_l2_shadow):
- if (level != 2)
- res |= sh2_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep));
- break;
-#if GUEST_PAGING_LEVELS == 3 /* PAE only */
- case PGC_SH2_type_to_index(PGC_SH2_l2h_shadow):
- if (level != 2)
- res |= sh2_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep));
- break;
-#endif
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
- case PGC_SH2_type_to_index(PGC_SH2_l3_shadow):
- if (level != 3)
- res |= sh2_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep));
- break;
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
- case PGC_SH2_type_to_index(PGC_SH2_l4_shadow):
- if (level != 4)
- res |= sh2_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep));
- break;
-#endif
-#endif
- default:
- SHADOW2_ERROR("mfn %"SH2_PRI_mfn" is shadowed in multiple "
- "modes: A&D bits may be out of sync (flags=%#x).\n",
- mfn_x(gmfn), pg->shadow2_flags);
- /* XXX Shadows in other modes will not be updated, so will
- * have their A and D bits out of sync. */
- }
- }
-
- /* We should never need to flush the TLB or recopy PAE entries */
- ASSERT( res == 0 || res == SHADOW2_SET_CHANGED );
- return flags;
-}
-
-/**************************************************************************/
-/* Functions to compute the correct index into a shadow page, given an
- * index into the guest page (as returned by guest_get_index()).
- * This is trivial when the shadow and guest use the same sized PTEs, but
- * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
- * PAE- or 64-bit shadows).
- *
- * These functions also increment the shadow mfn, when necessary. When PTE
- * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
- * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
- * use simple pointer arithmetic on a pointer to the guest L1e to figure out
- * which shadow page we really want. Similarly, when PTE sizes are
- * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
- * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
- * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
- * space.)
- *
- * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
- * of shadow (to store both the shadow, and the info that would normally be
- * stored in page_info fields). This arrangement allows the shadow and the
- * "page_info" fields to always be stored in the same page (in fact, in
- * the same cache line), avoiding an extra call to map_domain_page().
- */
-
-static inline u32
-guest_index(void *ptr)
-{
- return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
-}
-
-static inline u32
-shadow_l1_index(mfn_t *smfn, u32 guest_index)
-{
-#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
- *smfn = _mfn(mfn_x(*smfn) +
- (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
- return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
-#else
- return guest_index;
-#endif
-}
-
-static inline u32
-shadow_l2_index(mfn_t *smfn, u32 guest_index)
-{
-#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
- // Because we use 2 shadow l2 entries for each guest entry, the number of
- // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
- //
- *smfn = _mfn(mfn_x(*smfn) +
- (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
-
- // We multiple by two to get the index of the first of the two entries
- // used to shadow the specified guest entry.
- return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
-#else
- return guest_index;
-#endif
-}
-
-#if GUEST_PAGING_LEVELS >= 3
-
-static inline u32
-shadow_l3_index(mfn_t *smfn, u32 guest_index)
-{
-#if GUEST_PAGING_LEVELS == 3
- u32 group_id;
-
- // Because we use twice the space in L3 shadows as was consumed in guest
- // L3s, the number of guest entries per shadow page is
- // SHADOW_L2_PAGETABLE_ENTRIES/2. (Note this is *not*
- // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...)
- //
- *smfn = _mfn(mfn_x(*smfn) +
- (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
-
- // We store PAE L3 shadows in groups of 4, alternating shadows and
- // pae_l3_bookkeeping structs. So the effective shadow index is
- // the the group_id * 8 + the offset within the group.
- //
- guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2);
- group_id = guest_index / 4;
- return (group_id * 8) + (guest_index % 4);
-#else
- return guest_index;
-#endif
-}
-
-#endif // GUEST_PAGING_LEVELS >= 3
-
-#if GUEST_PAGING_LEVELS >= 4
-
-static inline u32
-shadow_l4_index(mfn_t *smfn, u32 guest_index)
-{
- return guest_index;
-}
-
-#endif // GUEST_PAGING_LEVELS >= 4
-
-
-/**************************************************************************/
-/* Functions which compute shadow entries from their corresponding guest
- * entries.
- *
- * These are the "heart" of the shadow code.
- *
- * There are two sets of these: those that are called on demand faults (read
- * faults and write faults), and those that are essentially called to
- * "prefetch" (or propagate) entries from the guest into the shadow. The read
- * fault and write fault are handled as two separate cases for L1 entries (due
- * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
- * into the respective demand_fault functions.
- */
-
-#define CHECK(_cond) \
-do { \
- if (unlikely(!(_cond))) \
- { \
- printk("%s %s %d ASSERTION (%s) FAILED\n", \
- __func__, __FILE__, __LINE__, #_cond); \
- return -1; \
- } \
-} while (0);
-
-// The function below tries to capture all of the flag manipulation for the
-// demand and propagate functions into one place.
-//
-static always_inline u32
-sh2_propagate_flags(struct vcpu *v, mfn_t target_mfn,
- u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn,
- int mmio, int level, fetch_type_t ft)
-{
- struct domain *d = v->domain;
- u32 pass_thru_flags;
- u32 sflags;
-
- // XXX -- might want to think about PAT support for HVM guests...
-
-#ifndef NDEBUG
- // MMIO can only occur from L1e's
- //
- if ( mmio )
- CHECK(level == 1);
-
- // We should always have a pointer to the guest entry if it's a non-PSE
- // non-MMIO demand access.
- if ( ft & FETCH_TYPE_DEMAND )
- CHECK(guest_entry_ptr || level == 1);
-#endif
-
- // A not-present guest entry has a special signature in the shadow table,
- // so that we do not have to consult the guest tables multiple times...
- //
- if ( unlikely(!(gflags & _PAGE_PRESENT)) )
- return _PAGE_SHADOW_GUEST_NOT_PRESENT;
-
- // Must have a valid target_mfn, unless this is mmio, or unless this is a
- // prefetch. In the case of a prefetch, an invalid mfn means that we can
- // not usefully shadow anything, and so we return early.
- //
- if ( !valid_mfn(target_mfn) )
- {
- CHECK((ft == ft_prefetch) || mmio);
- if ( !mmio )
- return 0;
- }
-
- // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's...
- //
- if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) )
- pass_thru_flags = _PAGE_PRESENT;
- else
- {
- pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
- _PAGE_RW | _PAGE_PRESENT);
- if ( guest_supports_nx(v) )
- pass_thru_flags |= _PAGE_NX_BIT;
- }
-
- // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their
- // L3e's; they are all implied. So we emulate them here.
- //
- if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) )
- gflags = pass_thru_flags;
-
- // Propagate bits from the guest to the shadow.
- // Some of these may be overwritten, below.
- // Since we know the guest's PRESENT bit is set, we also set the shadow's
- // SHADOW_PRESENT bit.
- //
- sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT;
-
- // Copy the guest's RW bit into the SHADOW_RW bit.
- //
- if ( gflags & _PAGE_RW )
- sflags |= _PAGE_SHADOW_RW;
-
- // Set the A&D bits for higher level shadows.
- // Higher level entries do not, strictly speaking, have dirty bits, but
- // since we use shadow linear tables, each of these entries may, at some
- // point in time, also serve as a shadow L1 entry.
- // By setting both the A&D bits in each of these, we eliminate the burden
- // on the hardware to update these bits on initial accesses.
- //
- if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
- sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
-
-
- // Set the A and D bits in the guest entry, if we need to.
- if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
- gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
-
- // If the A or D bit has not yet been set in the guest, then we must
- // prevent the corresponding kind of access.
- //
- if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) &&
- !(gflags & _PAGE_ACCESSED)) )
- sflags &= ~_PAGE_PRESENT;
-
- /* D bits exist in l1es, and 32bit/PAE PSE l2es, but not 64bit PSE l2es */
- if ( unlikely( ((level == 1)
- || ((level == 2) && (GUEST_PAGING_LEVELS < 4)
- && guest_supports_superpages(v) &&
- (gflags & _PAGE_PSE)))
- && !(gflags & _PAGE_DIRTY)) )
- sflags &= ~_PAGE_RW;
-
- // MMIO caching
- //
- // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit
- // to cache the fact that this entry is in MMIO space.
- //
- if ( (level == 1) && mmio )
- {
- sflags &= ~(_PAGE_PRESENT);
- sflags |= _PAGE_SHADOW_MMIO;
- }
- else
- {
- // shadow2_mode_log_dirty support
- //
- // Only allow the guest write access to a page a) on a demand fault,
- // or b) if the page is already marked as dirty.
- //
- if ( unlikely((level == 1) &&
- !(ft & FETCH_TYPE_WRITE) &&
- shadow2_mode_log_dirty(d) &&
- !sh2_mfn_is_dirty(d, target_mfn)) )
- {
- sflags &= ~_PAGE_RW;
- }
-
- // protect guest page tables
- //
- if ( unlikely((level == 1) &&
- sh2_mfn_is_a_page_table(target_mfn)) )
- {
- if ( shadow2_mode_trap_reads(d) )
- {
- // if we are trapping both reads & writes, then mark this page
- // as not present...
- //
- sflags &= ~_PAGE_PRESENT;
- }
- else
- {
- // otherwise, just prevent any writes...
- //
- sflags &= ~_PAGE_RW;
- }
- }
- }
-
- return sflags;
-}
-
-#undef CHECK
-
-#if GUEST_PAGING_LEVELS >= 4
-static void
-l4e_propagate_from_guest(struct vcpu *v,
- guest_l4e_t *gl4e,
- mfn_t gl4mfn,
- mfn_t sl3mfn,
- shadow_l4e_t *sl4p,
- fetch_type_t ft)
-{
- u32 gflags = guest_l4e_get_flags(*gl4e);
- u32 sflags = sh2_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e,
- gl4mfn, 0, 4, ft);
-
- *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags);
-
- SHADOW2_DEBUG(PROPAGATE,
- "%s gl4e=%" SH2_PRI_gpte " sl4e=%" SH2_PRI_pte "\n",
- fetch_type_names[ft], gl4e->l4, sl4p->l4);
- ASSERT(sflags != -1);
-}
-#endif // GUEST_PAGING_LEVELS >= 4
-
-#if GUEST_PAGING_LEVELS >= 3
-static void
-l3e_propagate_from_guest(struct vcpu *v,
- guest_l3e_t *gl3e,
- mfn_t gl3mfn,
- mfn_t sl2mfn,
- shadow_l3e_t *sl3p,
- fetch_type_t ft)
-{
- u32 gflags = guest_l3e_get_flags(*gl3e);
- u32 sflags = sh2_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e,
- gl3mfn, 0, 3, ft);
-
- *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags);
-
- SHADOW2_DEBUG(PROPAGATE,
- "%s gl3e=%" SH2_PRI_gpte " sl3e=%" SH2_PRI_pte "\n",
- fetch_type_names[ft], gl3e->l3, sl3p->l3);
- ASSERT(sflags != -1);
-}
-#endif // GUEST_PAGING_LEVELS >= 3
-
-static void
-l2e_propagate_from_guest(struct vcpu *v,
- guest_l2e_t *gl2e,
- mfn_t gl2mfn,
- mfn_t sl1mfn,
- shadow_l2e_t *sl2p,
- fetch_type_t ft)
-{
- u32 gflags = guest_l2e_get_flags(*gl2e);
- u32 sflags = sh2_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e,
- gl2mfn, 0, 2, ft);
-
- *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags);
-
- SHADOW2_DEBUG(PROPAGATE,
- "%s gl2e=%" SH2_PRI_gpte " sl2e=%" SH2_PRI_pte "\n",
- fetch_type_names[ft], gl2e->l2, sl2p->l2);
- ASSERT(sflags != -1);
-}
-
-static inline int
-l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
- int mmio)
-/* returns 1 if emulation is required, and 0 otherwise */
-{
- struct domain *d = v->domain;
- u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
- u32 sflags = sh2_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
- mmio, 1, ft_demand_read);
-
- if ( shadow2_mode_trap_reads(d) && !mmio && sh2_mfn_is_a_page_table(gmfn) )
- {
- // emulation required!
- *sl1p = shadow_l1e_empty();
- return 1;
- }
-
- *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
-
- SHADOW2_DEBUG(PROPAGATE,
- "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
- (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
-
- ASSERT(sflags != -1);
- return 0;
-}
-
-static inline int
-l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
- int mmio)
-/* returns 1 if emulation is required, and 0 otherwise */
-{
- struct domain *d = v->domain;
- u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
- u32 sflags = sh2_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
- mmio, 1, ft_demand_write);
-
- sh2_mark_dirty(d, gmfn);
-
- if ( !mmio && sh2_mfn_is_a_page_table(gmfn) )
- {
- // emulation required!
- *sl1p = shadow_l1e_empty();
- return 1;
- }
-
- *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
-
- SHADOW2_DEBUG(PROPAGATE,
- "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
- (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
-
- ASSERT(sflags != -1);
- return 0;
-}
-
-static inline void
-l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p,
- int mmio)
-{
- gfn_t gfn = guest_l1e_get_gfn(gl1e);
- mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn);
- u32 gflags = guest_l1e_get_flags(gl1e);
- u32 sflags = sh2_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN),
- mmio, 1, ft_prefetch);
-
- *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
-
- SHADOW2_DEBUG(PROPAGATE,
- "gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
- gl1e.l1, sl1p->l1);
-
- ASSERT(sflags != -1);
-}
-
-
-/**************************************************************************/
-/* These functions update shadow entries (and do bookkeeping on the shadow
- * tables they are in). It is intended that they are the only
- * functions which ever write (non-zero) data onto a shadow page.
- *
- * They return a set of flags:
- * SHADOW2_SET_CHANGED -- we actually wrote a new value to the shadow.
- * SHADOW2_SET_FLUSH -- the caller must cause a TLB flush.
- * SHADOW2_SET_ERROR -- the input is not a valid entry (for example, if
- * shadow2_get_page_from_l1e() fails).
- * SHADOW2_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local
- * copies of their PAE L3 entries re-copied.
- */
-
-static inline void safe_write_entry(void *dst, void *src)
-/* Copy one PTE safely when processors might be running on the
- * destination pagetable. This does *not* give safety against
- * concurrent writes (that's what the shadow lock is for), just
- * stops the hardware picking up partially written entries. */
-{
- volatile unsigned long *d = dst;
- unsigned long *s = src;
- ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
-#if CONFIG_PAGING_LEVELS == 3
- /* In PAE mode, pagetable entries are larger
- * than machine words, so won't get written atomically. We need to make
- * sure any other cpu running on these shadows doesn't see a
- * half-written entry. Do this by marking the entry not-present first,
- * then writing the high word before the low word. */
- BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
- d[0] = 0;
- d[1] = s[1];
- d[0] = s[0];
-#else
- /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
- * which will be an atomic write, since the entry is aligned. */
- BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
- *d = *s;
-#endif
-}
-
-
-static inline void
-shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
-/* This function does the actual writes to shadow pages.
- * It must not be called directly, since it doesn't do the bookkeeping
- * that shadow_set_l*e() functions do. */
-{
- shadow_l1e_t *dst = d;
- shadow_l1e_t *src = s;
- void *map = NULL;
- int i;
-
- /* Because we mirror access rights at all levels in the shadow, an
- * l2 (or higher) entry with the RW bit cleared will leave us with
- * no write access through the linear map.
- * We detect that by writing to the shadow with copy_to_user() and
- * using map_domain_page() to get a writeable mapping if we need to. */
- if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
- {
- perfc_incrc(shadow2_linear_map_failed);
- map = sh2_map_domain_page(mfn);
- ASSERT(map != NULL);
- dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
- }
-
-
- for ( i = 0; i < entries; i++ )
- safe_write_entry(dst++, src++);
-
- if ( map != NULL ) sh2_unmap_domain_page(map);
-
- /* XXX TODO:
- * Update min/max field in page_info struct of this mfn */
-}
-
-static inline int
-perms_strictly_increased(u32 old_flags, u32 new_flags)
-/* Given the flags of two entries, are the new flags a strict
- * increase in rights over the old ones? */
-{
- u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
- u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
- /* Flip the NX bit, since it's the only one that decreases rights;
- * we calculate as if it were an "X" bit. */
- of ^= _PAGE_NX_BIT;
- nf ^= _PAGE_NX_BIT;
- /* If the changed bits are all set in the new flags, then rights strictly
- * increased between old and new. */
- return ((of | (of ^ nf)) == nf);
-}
-
-static int inline
-shadow2_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
-{
- int res;
- mfn_t mfn;
- struct domain *owner;
- shadow_l1e_t sanitized_sl1e =
- shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT);
-
- //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT);
- //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0);
-
- if ( !shadow2_mode_refcounts(d) )
- return 1;
-
- res = get_page_from_l1e(sanitized_sl1e, d);
-
- // If a privileged domain is attempting to install a map of a page it does
- // not own, we let it succeed anyway.
- //
- if ( unlikely(!res) &&
- IS_PRIV(d) &&
- !shadow2_mode_translate(d) &&
- valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) &&
- (owner = page_get_owner(mfn_to_page(mfn))) &&
- (d != owner) )
- {
- res = get_page_from_l1e(sanitized_sl1e, owner);
- SHADOW2_PRINTK("privileged domain %d installs map of mfn %05lx "
- "which is owned by domain %d: %s\n",
- d->domain_id, mfn_x(mfn), owner->domain_id,
- res ? "success" : "failed");
- }
-
- if ( unlikely(!res) )
- {
- perfc_incrc(shadow2_get_page_fail);
- SHADOW2_PRINTK("failed: l1e=" SH2_PRI_pte "\n");
- }
-
- return res;
-}
-
-static void inline
-shadow2_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
-{
- if ( !shadow2_mode_refcounts(d) )
- return;
-
- put_page_from_l1e(sl1e, d);
-}
-
-#if GUEST_PAGING_LEVELS >= 4
-static int shadow_set_l4e(struct vcpu *v,
- shadow_l4e_t *sl4e,
- shadow_l4e_t new_sl4e,
- mfn_t sl4mfn)
-{
- int flags = 0;
- shadow_l4e_t old_sl4e;
- paddr_t paddr;
- ASSERT(sl4e != NULL);
- old_sl4e = *sl4e;
-
- if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
-
- paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
- | (((unsigned long)sl4e) & ~PAGE_MASK));
-
- if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
- {
- /* About to install a new reference */
- sh2_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr);
- }
-
- /* Write the new entry */
- shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
- flags |= SHADOW2_SET_CHANGED;
-
- if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
- {
- /* We lost a reference to an old mfn. */
- mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
- if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
- || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
- shadow_l4e_get_flags(new_sl4e)) )
- {
- flags |= SHADOW2_SET_FLUSH;
- }
- sh2_put_ref(v, osl3mfn, paddr);
- }
- return flags;
-}
-#endif /* GUEST_PAGING_LEVELS >= 4 */
-
-#if GUEST_PAGING_LEVELS >= 3
-static int shadow_set_l3e(struct vcpu *v,
- shadow_l3e_t *sl3e,
- shadow_l3e_t new_sl3e,
- mfn_t sl3mfn)
-{
- int flags = 0;
- shadow_l3e_t old_sl3e;
- paddr_t paddr;
- ASSERT(sl3e != NULL);
- old_sl3e = *sl3e;
-
- if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
-
- paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
- | (((unsigned long)sl3e) & ~PAGE_MASK));
-
- if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
- {
- /* About to install a new reference */
- sh2_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr);
- }
-
- /* Write the new entry */
- shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
- flags |= SHADOW2_SET_CHANGED;
-
-#if GUEST_PAGING_LEVELS == 3
- /* We wrote a guest l3e in a PAE pagetable. This table is copied in
- * the linear pagetable entries of its l2s, and may also be copied
- * to a low memory location to make it fit in CR3. Report that we
- * need to resync those copies (we can't wait for the guest to flush
- * the TLB because it might be an increase in rights). */
- {
- struct vcpu *vcpu;
-
- struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e);
- for_each_vcpu(v->domain, vcpu)
- {
- if (info->vcpus & (1 << vcpu->vcpu_id))
- {
- // Remember that this flip/update needs to occur.
- vcpu->arch.shadow2.pae_flip_pending = 1;
- flags |= SHADOW2_SET_L3PAE_RECOPY;
- }
- }
- }
-#endif
-
- if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
- {
- /* We lost a reference to an old mfn. */
- mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
- if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
- !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
- shadow_l3e_get_flags(new_sl3e)) )
- {
- flags |= SHADOW2_SET_FLUSH;
- }
- sh2_put_ref(v, osl2mfn, paddr);
- }
- return flags;
-}
-#endif /* GUEST_PAGING_LEVELS >= 3 */
-
-static int shadow_set_l2e(struct vcpu *v,
- shadow_l2e_t *sl2e,
- shadow_l2e_t new_sl2e,
- mfn_t sl2mfn)
-{
- int flags = 0;
- shadow_l2e_t old_sl2e;
- paddr_t paddr;
-
-#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
- /* In 2-on-3 we work with pairs of l2es pointing at two-page
- * shadows. Reference counting and up-pointers track from the first
- * page of the shadow to the first l2e, so make sure that we're
- * working with those:
- * Align the pointer down so it's pointing at the first of the pair */
- sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
- /* Align the mfn of the shadow entry too */
- new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
-#endif
-
- ASSERT(sl2e != NULL);
- old_sl2e = *sl2e;
-
- if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
-
- paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
- | (((unsigned long)sl2e) & ~PAGE_MASK));
-
- if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
- {
- /* About to install a new reference */
- sh2_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
- }
-
- /* Write the new entry */
-#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
- {
- shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
- /* The l1 shadow is two pages long and need to be pointed to by
- * two adjacent l1es. The pair have the same flags, but point
- * at odd and even MFNs */
- ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
- pair[1].l2 |= (1<<PAGE_SHIFT);
- shadow_write_entries(sl2e, &pair, 2, sl2mfn);
- }
-#else /* normal case */
- shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
-#endif
- flags |= SHADOW2_SET_CHANGED;
-
- if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
- {
- /* We lost a reference to an old mfn. */
- mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
- if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
- !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
- shadow_l2e_get_flags(new_sl2e)) )
- {
- flags |= SHADOW2_SET_FLUSH;
- }
- sh2_put_ref(v, osl1mfn, paddr);
- }
- return flags;
-}
-
-static int shadow_set_l1e(struct vcpu *v,
- shadow_l1e_t *sl1e,
- shadow_l1e_t new_sl1e,
- mfn_t sl1mfn)
-{
- int flags = 0;
- struct domain *d = v->domain;
- shadow_l1e_t old_sl1e;
- ASSERT(sl1e != NULL);
-
- old_sl1e = *sl1e;
-
- if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
-
- if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT )
- {
- /* About to install a new reference */
- if ( shadow2_mode_refcounts(d) ) {
- if ( shadow2_get_page_from_l1e(new_sl1e, d) == 0 )
- {
- /* Doesn't look like a pagetable. */
- flags |= SHADOW2_SET_ERROR;
- new_sl1e = shadow_l1e_empty();
- }
- }
- }
-
- /* Write the new entry */
- shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
- flags |= SHADOW2_SET_CHANGED;
-
- if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
- {
- /* We lost a reference to an old mfn. */
- /* N.B. Unlike higher-level sets, never need an extra flush
- * when writing an l1e. Because it points to the same guest frame
- * as the guest l1e did, it's the guest's responsibility to
- * trigger a flush later. */
- if ( shadow2_mode_refcounts(d) )
- {
- shadow2_put_page_from_l1e(old_sl1e, d);
- }
- }
- return flags;
-}
-
-
-/**************************************************************************/
-/* These functions take a vcpu and a virtual address, and return a pointer
- * to the appropriate level N entry from the shadow tables.
- * If the necessary tables are not present in the shadow, they return NULL. */
-
-/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
- * more levels than the guest, the upper levels are always fixed and do not
- * reflect any information from the guest, so we do not use these functions
- * to access them. */
-
-#if GUEST_PAGING_LEVELS >= 4
-static shadow_l4e_t *
-shadow_get_l4e(struct vcpu *v, unsigned long va)
-{
- /* Reading the top level table is always valid. */
- return sh2_linear_l4_table(v) + shadow_l4_linear_offset(va);
-}
-#endif /* GUEST_PAGING_LEVELS >= 4 */
-
-
-#if GUEST_PAGING_LEVELS >= 3
-static shadow_l3e_t *
-shadow_get_l3e(struct vcpu *v, unsigned long va)
-{
-#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
- /* Get the l4 */
- shadow_l4e_t *sl4e = shadow_get_l4e(v, va);
- ASSERT(sl4e != NULL);
- if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) )
- return NULL;
- ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e)));
- /* l4 was present; OK to get the l3 */
- return sh2_linear_l3_table(v) + shadow_l3_linear_offset(va);
-#else /* PAE... */
- /* Top level is always mapped */
- ASSERT(v->arch.shadow_vtable);
- return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va);
-#endif
-}
-#endif /* GUEST_PAGING_LEVELS >= 3 */
-
-
-static shadow_l2e_t *
-shadow_get_l2e(struct vcpu *v, unsigned long va)
-{
-#if GUEST_PAGING_LEVELS >= 3 /* 64bit/PAE... */
- /* Get the l3 */
- shadow_l3e_t *sl3e = shadow_get_l3e(v, va);
- if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
- return NULL;
- ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e)));
- /* l3 was present; OK to get the l2 */
-#endif
- return sh2_linear_l2_table(v) + shadow_l2_linear_offset(va);
-}
-
-
-#if 0 // avoid the compiler warning for now...
-
-static shadow_l1e_t *
-shadow_get_l1e(struct vcpu *v, unsigned long va)
-{
- /* Get the l2 */
- shadow_l2e_t *sl2e = shadow_get_l2e(v, va);
- if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) )
- return NULL;
- ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e)));
- /* l2 was present; OK to get the l1 */
- return sh2_linear_l1_table(v) + shadow_l1_linear_offset(va);
-}
-
-#endif
-
-
-/**************************************************************************/
-/* Macros to walk pagetables. These take the shadow of a pagetable and
- * walk every "interesting" entry. That is, they don't touch Xen mappings,
- * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
- * second entry (since pairs of entries are managed together). For multi-page
- * shadows they walk all pages.
- *
- * Arguments are an MFN, the variable to point to each entry, a variable
- * to indicate that we are done (we will shortcut to the end of the scan
- * when _done != 0), a variable to indicate that we should avoid Xen mappings,
- * and the code.
- *
- * WARNING: These macros have side-effects. They change the values of both
- * the pointer and the MFN. */
-
-static inline void increment_ptr_to_guest_entry(void *ptr)
-{
- if ( ptr )
- {
- guest_l1e_t **entry = ptr;
- (*entry)++;
- }
-}
-
-/* All kinds of l1: touch all entries */
-#define _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
-do { \
- int _i; \
- shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
- ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask) \
- == PGC_SH2_l1_shadow \
- || (mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask) \
- == PGC_SH2_fl1_shadow); \
- for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
- { \
- (_sl1e) = _sp + _i; \
- if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
- {_code} \
- if ( _done ) break; \
- increment_ptr_to_guest_entry(_gl1p); \
- } \
- unmap_shadow_page(_sp); \
-} while (0)
-
-/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
-#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
-#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
-do { \
- int __done = 0; \
- _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
- ({ (__done = _done); }), _code); \
- _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
- if ( !__done ) \
- _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
- ({ (__done = _done); }), _code); \
-} while (0)
-#else /* Everything else; l1 shadows are only one page */
-#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
- _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
-#endif
-
-
-#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
-
-/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
-#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
-do { \
- int _i, _j, __done = 0; \
- ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
- == PGC_SH2_l2_32_shadow); \
- for ( _j = 0; _j < 4 && !__done; _j++ ) \
- { \
- shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
- for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
- if ( (!(_xen)) \
- || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
- < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
- { \
- (_sl2e) = _sp + _i; \
- if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
- {_code} \
- if ( (__done = (_done)) ) break; \
- increment_ptr_to_guest_entry(_gl2p); \
- } \
- unmap_shadow_page(_sp); \
- _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
- } \
-} while (0)
-
-#elif GUEST_PAGING_LEVELS == 2
-
-/* 32-bit on 32-bit: avoid Xen entries */
-#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
-do { \
- int _i; \
- shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
- ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
- == PGC_SH2_l2_32_shadow); \
- for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
- if ( (!(_xen)) \
- || \
- (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
- { \
- (_sl2e) = _sp + _i; \
- if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
- {_code} \
- if ( _done ) break; \
- increment_ptr_to_guest_entry(_gl2p); \
- } \
- unmap_shadow_page(_sp); \
-} while (0)
-
-#elif GUEST_PAGING_LEVELS == 3
-
-/* PAE: if it's an l2h, don't touch Xen mappings */
-#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
-do { \
- int _i; \
- shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
- ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
- == PGC_SH2_l2_pae_shadow \
- || (mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
- == PGC_SH2_l2h_pae_shadow); \
- for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
- if ( (!(_xen)) \
- || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
- != PGC_SH2_l2h_pae_shadow) \
- || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
- < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
- { \
- (_sl2e) = _sp + _i; \
- if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
- {_code} \
- if ( _done ) break; \
- increment_ptr_to_guest_entry(_gl2p); \
- } \
- unmap_shadow_page(_sp); \
-} while (0)
-
-#else
-
-/* 64-bit l2: touch all entries */
-#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
-do { \
- int _i; \
- shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
- ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
- == PGC_SH2_l2_64_shadow); \
- for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
- { \
- (_sl2e) = _sp + _i; \
- if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
- {_code} \
- if ( _done ) break; \
- increment_ptr_to_guest_entry(_gl2p); \
- } \
- unmap_shadow_page(_sp); \
-} while (0)
-
-#endif /* different kinds of l2 */
-
-#if GUEST_PAGING_LEVELS == 3
-
-/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). */
-#define SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code) \
-do { \
- int _i; \
- for ( _i = 0; _i < 4; _i++ ) \
- { \
- if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
- {_code} \
- if ( _done ) break; \
- _sl3e++; \
- increment_ptr_to_guest_entry(_gl3p); \
- } \
-} while (0)
-
-/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */
-#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
-do { \
- int _i, _j, _k, __done = 0; \
- ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask) \
- == PGC_SH2_l3_pae_shadow); \
- /* The subshadows are split, 64 on each page of the shadow */ \
- for ( _j = 0; _j < 2 && !__done; _j++ ) \
- { \
- void *_sp = sh2_map_domain_page(_sl3mfn); \
- for ( _i = 0; _i < 64; _i++ ) \
- { \
- /* Every second 32-byte region is a bookkeeping entry */ \
- _sl3e = (shadow_l3e_t *)(_sp + (64 * _i)); \
- if ( (sl3p_to_info(_sl3e))->refcount > 0 ) \
- SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, \
- ({ __done = (_done); __done; }), \
- _code); \
- else \
- for ( _k = 0 ; _k < 4 ; _k++ ) \
- increment_ptr_to_guest_entry(_gl3p); \
- if ( __done ) break; \
- } \
- sh2_unmap_domain_page(_sp); \
- _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1); \
- } \
-} while (0)
-
-#elif GUEST_PAGING_LEVELS == 4
-
-/* 64-bit l3: touch all entries */
-#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
-do { \
- int _i; \
- shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
- ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask) \
- == PGC_SH2_l3_64_shadow); \
- for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
- { \
- (_sl3e) = _sp + _i; \
- if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
- {_code} \
- if ( _done ) break; \
- increment_ptr_to_guest_entry(_gl3p); \
- } \
- unmap_shadow_page(_sp); \
-} while (0)
-
-/* 64-bit l4: avoid Xen mappings */
-#define SHADOW2_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \
-do { \
- int _i; \
- shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
- ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH2_type_mask) \
- == PGC_SH2_l4_64_shadow); \
- for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
- { \
- if ( (!(_xen)) || is_guest_l4_slot(_i) ) \
- { \
- (_sl4e) = _sp + _i; \
- if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
- {_code} \
- if ( _done ) break; \
- } \
- increment_ptr_to_guest_entry(_gl4p); \
- } \
- unmap_shadow_page(_sp); \
-} while (0)
-
-#endif
-
-
-
-/**************************************************************************/
-/* Functions to install Xen mappings and linear mappings in shadow pages */
-
-static mfn_t sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type);
-
-// XXX -- this function should probably be moved to shadow2-common.c, but that
-// probably wants to wait until the shadow types have been moved from
-// shadow2-types.h to shadow2-private.h
-//
-#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
-void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
-{
- struct domain *d = v->domain;
- shadow_l4e_t *sl4e;
-
- sl4e = sh2_map_domain_page(sl4mfn);
- ASSERT(sl4e != NULL);
- ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
-
- /* Copy the common Xen mappings from the idle domain */
- memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
- &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
- ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
-
- /* Install the per-domain mappings for this domain */
- sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
- shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
- __PAGE_HYPERVISOR);
-
- /* Linear mapping */
- sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
- shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
- sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
- shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
-
- if ( shadow2_mode_translate(v->domain) )
- {
- /* install domain-specific P2M table */
- sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
- shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
- __PAGE_HYPERVISOR);
- }
-
- sh2_unmap_domain_page(sl4e);
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
-// For 3-on-3 PV guests, we need to make sure the xen mappings are in
-// place, which means that we need to populate the l2h entry in the l3
-// table.
-
-void sh2_install_xen_entries_in_l2h(struct vcpu *v,
- mfn_t sl2hmfn)
-{
- struct domain *d = v->domain;
- shadow_l2e_t *sl2e;
- int i;
-
- sl2e = sh2_map_domain_page(sl2hmfn);
- ASSERT(sl2e != NULL);
- ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
-
- /* Copy the common Xen mappings from the idle domain */
- memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
- &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
- L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
-
- /* Install the per-domain mappings for this domain */
- for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
- sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
- shadow_l2e_from_mfn(
- page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
- __PAGE_HYPERVISOR);
-
- /* We don't set up a linear mapping here because we can't until this
- * l2h is installed in an l3e. sh2_update_linear_entries() handles
- * the linear mappings when the l3 is loaded. */
-
- if ( shadow2_mode_translate(d) )
- {
- /* Install the domain-specific p2m table */
- l3_pgentry_t *p2m;
- ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
- p2m = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
- for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
- {
- sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
- shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
- __PAGE_HYPERVISOR);
- }
- sh2_unmap_domain_page(p2m);
- }
-
- sh2_unmap_domain_page(sl2e);
-}
-
-void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn)
-{
- shadow_l3e_t *sl3e;
- guest_l3e_t *gl3e = v->arch.guest_vtable;
- shadow_l3e_t new_sl3e;
- gfn_t l2gfn;
- mfn_t l2gmfn, l2smfn;
- int r;
-
- ASSERT(!shadow2_mode_external(v->domain));
- ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT);
- l2gfn = guest_l3e_get_gfn(gl3e[3]);
- l2gmfn = sh2_gfn_to_mfn(v->domain, gfn_x(l2gfn));
- l2smfn = get_shadow_status(v, l2gmfn, PGC_SH2_l2h_shadow);
- if ( !valid_mfn(l2smfn) )
- {
- l2smfn = sh2_make_shadow(v, l2gmfn, PGC_SH2_l2h_shadow);
- }
- l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e,
- ft_prefetch);
- sl3e = sh2_map_domain_page(sl3mfn);
- r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn);
- sh2_unmap_domain_page(sl3e);
-}
-#endif
-
-
-#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
-void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
-{
- struct domain *d = v->domain;
- shadow_l2e_t *sl2e;
- int i;
-
- sl2e = sh2_map_domain_page(sl2mfn);
- ASSERT(sl2e != NULL);
- ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
-
- /* Copy the common Xen mappings from the idle domain */
- memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
- &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
- L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
-
- /* Install the per-domain mappings for this domain */
- for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
- sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
- shadow_l2e_from_mfn(
- page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
- __PAGE_HYPERVISOR);
-
- /* Linear mapping */
- sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
- shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
- sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
- shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
-
- if ( shadow2_mode_translate(d) )
- {
- /* install domain-specific P2M table */
- sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
- shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
- __PAGE_HYPERVISOR);
- }
-
- sh2_unmap_domain_page(sl2e);
-}
-#endif
-
-
-
-
-
-/**************************************************************************/
-/* Create a shadow of a given guest page.
- */
-static mfn_t
-sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
-{
- mfn_t smfn = shadow2_alloc(v->domain, shadow_type, mfn_x(gmfn));
- SHADOW2_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
- mfn_x(gmfn), shadow_type, mfn_x(smfn));
-
- if ( shadow_type != PGC_SH2_guest_root_type )
- /* Lower-level shadow, not yet linked form a higher level */
- mfn_to_page(smfn)->up = 0;
-
- // Create the Xen mappings...
- if ( !shadow2_mode_external(v->domain) )
- {
- switch (shadow_type)
- {
-#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
- case PGC_SH2_l4_shadow:
- sh2_install_xen_entries_in_l4(v, gmfn, smfn); break;
-#endif
-#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
- case PGC_SH2_l3_shadow:
- sh2_install_xen_entries_in_l3(v, gmfn, smfn); break;
- case PGC_SH2_l2h_shadow:
- sh2_install_xen_entries_in_l2h(v, smfn); break;
-#endif
-#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
- case PGC_SH2_l2_shadow:
- sh2_install_xen_entries_in_l2(v, gmfn, smfn); break;
-#endif
- default: /* Do nothing */ break;
- }
- }
-
- shadow2_promote(v, gmfn, shadow_type);
- set_shadow2_status(v, gmfn, shadow_type, smfn);
-
- return smfn;
-}
-
-/* Make a splintered superpage shadow */
-static mfn_t
-make_fl1_shadow(struct vcpu *v, gfn_t gfn)
-{
- mfn_t smfn = shadow2_alloc(v->domain, PGC_SH2_fl1_shadow,
- (unsigned long) gfn_x(gfn));
-
- SHADOW2_DEBUG(MAKE_SHADOW, "(%" SH2_PRI_gfn ")=>%" SH2_PRI_mfn "\n",
- gfn_x(gfn), mfn_x(smfn));
-
- set_fl1_shadow_status(v, gfn, smfn);
- return smfn;
-}
-
-
-#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
-mfn_t
-sh2_make_monitor_table(struct vcpu *v)
-{
-
- ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
-
-#if CONFIG_PAGING_LEVELS == 4
- {
- struct domain *d = v->domain;
- mfn_t m4mfn;
- m4mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
- sh2_install_xen_entries_in_l4(v, m4mfn, m4mfn);
- /* Remember the level of this table */
- mfn_to_page(m4mfn)->shadow2_flags = 4;
-#if SHADOW_PAGING_LEVELS < 4
- // Install a monitor l3 table in slot 0 of the l4 table.
- // This is used for shadow linear maps.
- {
- mfn_t m3mfn;
- l4_pgentry_t *l4e;
- m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
- mfn_to_page(m3mfn)->shadow2_flags = 3;
- l4e = sh2_map_domain_page(m4mfn);
- l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
- sh2_unmap_domain_page(l4e);
- }
-#endif /* SHADOW_PAGING_LEVELS < 4 */
- return m4mfn;
- }
-
-#elif CONFIG_PAGING_LEVELS == 3
-
- {
- struct domain *d = v->domain;
- mfn_t m3mfn, m2mfn;
- l3_pgentry_t *l3e;
- l2_pgentry_t *l2e;
- int i;
-
- m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
- /* Remember the level of this table */
- mfn_to_page(m3mfn)->shadow2_flags = 3;
-
- // Install a monitor l2 table in slot 3 of the l3 table.
- // This is used for all Xen entries, including linear maps
- m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
- mfn_to_page(m2mfn)->shadow2_flags = 2;
- l3e = sh2_map_domain_page(m3mfn);
- l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
- sh2_install_xen_entries_in_l2h(v, m2mfn);
- /* Install the monitor's own linear map */
- l2e = sh2_map_domain_page(m2mfn);
- for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
- l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
- (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
- ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
- : l2e_empty();
- sh2_unmap_domain_page(l2e);
- sh2_unmap_domain_page(l3e);
-
- SHADOW2_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
- return m3mfn;
- }
-
-#elif CONFIG_PAGING_LEVELS == 2
-
- {
- struct domain *d = v->domain;
- mfn_t m2mfn;
- m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
- sh2_install_xen_entries_in_l2(v, m2mfn, m2mfn);
- /* Remember the level of this table */
- mfn_to_page(m2mfn)->shadow2_flags = 2;
- return m2mfn;
- }
-
-#else
-#error this should not happen
-#endif /* CONFIG_PAGING_LEVELS */
-}
-#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
-
-/**************************************************************************/
-/* These functions also take a virtual address and return the level-N
- * shadow table mfn and entry, but they create the shadow pagetables if
- * they are needed. The "demand" argument is non-zero when handling
- * a demand fault (so we know what to do about accessed bits &c).
- * If the necessary tables are not present in the guest, they return NULL. */
-#if GUEST_PAGING_LEVELS >= 4
-static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
- walk_t *gw,
- mfn_t *sl4mfn)
-{
- /* There is always a shadow of the top level table. Get it. */
- *sl4mfn = pagetable_get_mfn(v->arch.shadow_table);
- /* Reading the top level table is always valid. */
- return sh2_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
-}
-#endif /* GUEST_PAGING_LEVELS >= 4 */
-
-
-#if GUEST_PAGING_LEVELS >= 3
-static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
- walk_t *gw,
- mfn_t *sl3mfn,
- fetch_type_t ft)
-{
-#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
- mfn_t sl4mfn;
- shadow_l4e_t *sl4e;
- if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */
- /* Get the l4e */
- sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
- ASSERT(sl4e != NULL);
- if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
- {
- *sl3mfn = shadow_l4e_get_mfn(*sl4e);
- ASSERT(valid_mfn(*sl3mfn));
- }
- else
- {
- int r;
- shadow_l4e_t new_sl4e;
- /* No l3 shadow installed: find and install it. */
- *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH2_l3_shadow);
- if ( !valid_mfn(*sl3mfn) )
- {
- /* No l3 shadow of this page exists at all: make one. */
- *sl3mfn = sh2_make_shadow(v, gw->l3mfn, PGC_SH2_l3_shadow);
- }
- /* Install the new sl3 table in the sl4e */
- l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
- *sl3mfn, &new_sl4e, ft);
- r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
- ASSERT((r & SHADOW2_SET_FLUSH) == 0);
- }
- /* Now follow it down a level. Guaranteed to succeed. */
- return sh2_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
-#else /* PAE... */
- /* There is always a shadow of the top level table. Get it. */
- *sl3mfn = pagetable_get_mfn(v->arch.shadow_table);
- /* This next line is important: the shadow l3 table is in an 8k
- * shadow and we need to return the right mfn of the pair. This call
- * will set it for us as a side-effect. */
- (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e));
- ASSERT(v->arch.shadow_vtable);
- return ((shadow_l3e_t *)v->arch.shadow_vtable)
- + shadow_l3_table_offset(gw->va);
-#endif /* GUEST_PAGING_LEVELS >= 4 */
-}
-#endif /* GUEST_PAGING_LEVELS >= 3 */
-
-
-static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
- walk_t *gw,
- mfn_t *sl2mfn,
- fetch_type_t ft)
-{
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */
- mfn_t sl3mfn = _mfn(INVALID_MFN);
- shadow_l3e_t *sl3e;
- if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
- /* Get the l3e */
- sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
- ASSERT(sl3e != NULL); /* Since we know guest PT is valid this far */
- if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
- {
- *sl2mfn = shadow_l3e_get_mfn(*sl3e);
- ASSERT(valid_mfn(*sl2mfn));
- }
- else
- {
- int r;
- shadow_l3e_t new_sl3e;
- /* No l2 shadow installed: find and install it. */
- *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH2_l2_shadow);
- if ( !valid_mfn(*sl2mfn) )
- {
- /* No l2 shadow of this page exists at all: make one. */
- *sl2mfn = sh2_make_shadow(v, gw->l2mfn, PGC_SH2_l2_shadow);
- }
- /* Install the new sl2 table in the sl3e */
- l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
- *sl2mfn, &new_sl3e, ft);
- r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
- ASSERT((r & SHADOW2_SET_FLUSH) == 0);
-#if GUEST_PAGING_LEVELS == 3
- /* Need to sync up the linear maps, as we are about to use them */
- ASSERT( r & SHADOW2_SET_L3PAE_RECOPY );
- sh2_pae_recopy(v->domain);
-#endif
- }
- /* Now follow it down a level. Guaranteed to succeed. */
- return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
-#else /* 32bit... */
- /* There is always a shadow of the top level table. Get it. */
- *sl2mfn = pagetable_get_mfn(v->arch.shadow_table);
- /* This next line is important: the guest l2 has a 16k
- * shadow, we need to return the right mfn of the four. This
- * call will set it for us as a side-effect. */
- (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
- /* Reading the top level table is always valid. */
- return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
-#endif
-}
-
-
-static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
- walk_t *gw,
- mfn_t *sl1mfn,
- fetch_type_t ft)
-{
- mfn_t sl2mfn;
- shadow_l2e_t *sl2e;
-
- /* Get the l2e */
- sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
- if ( sl2e == NULL ) return NULL;
- if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
- {
- *sl1mfn = shadow_l2e_get_mfn(*sl2e);
- ASSERT(valid_mfn(*sl1mfn));
- }
- else
- {
- shadow_l2e_t new_sl2e;
- int r, flags = guest_l2e_get_flags(*gw->l2e);
- /* No l1 shadow installed: find and install it. */
- if ( !(flags & _PAGE_PRESENT) )
- return NULL; /* No guest page. */
- if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
- {
- /* Splintering a superpage */
- gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
- *sl1mfn = get_fl1_shadow_status(v, l2gfn);
- if ( !valid_mfn(*sl1mfn) )
- {
- /* No fl1 shadow of this superpage exists at all: make one. */
- *sl1mfn = make_fl1_shadow(v, l2gfn);
- }
- }
- else
- {
- /* Shadowing an actual guest l1 table */
- if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
- *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH2_l1_shadow);
- if ( !valid_mfn(*sl1mfn) )
- {
- /* No l1 shadow of this page exists at all: make one. */
- *sl1mfn = sh2_make_shadow(v, gw->l1mfn, PGC_SH2_l1_shadow);
- }
- }
- /* Install the new sl1 table in the sl2e */
- l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
- *sl1mfn, &new_sl2e, ft);
- r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
- ASSERT((r & SHADOW2_SET_FLUSH) == 0);
- /* This next line is important: in 32-on-PAE and 32-on-64 modes,
- * the guest l1 table has an 8k shadow, and we need to return
- * the right mfn of the pair. This call will set it for us as a
- * side-effect. (In all other cases, it's a no-op and will be
- * compiled out.) */
- (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
- }
- /* Now follow it down a level. Guaranteed to succeed. */
- return sh2_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
-}
-
-
-
-/**************************************************************************/
-/* Destructors for shadow tables:
- * Unregister the shadow, decrement refcounts of any entries present in it,
- * and release the memory.
- *
- * N.B. These destructors do not clear the contents of the shadows.
- * This allows us to delay TLB shootdowns until the page is being reused.
- * See shadow2_alloc() and shadow2_free() for how this is handled.
- */
-
-#if GUEST_PAGING_LEVELS >= 4
-void sh2_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
-{
- shadow_l4e_t *sl4e;
- u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
- mfn_t gmfn, sl4mfn;
- int xen_mappings;
-
- SHADOW2_DEBUG(DESTROY_SHADOW,
- "%s(%05lx)\n", __func__, mfn_x(smfn));
- ASSERT(t == PGC_SH2_l4_shadow);
-
- /* Record that the guest page isn't shadowed any more (in this type) */
- gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
- delete_shadow2_status(v, gmfn, t, smfn);
- shadow2_demote(v, gmfn, t);
- /* Take this shadow off the list of root shadows */
- list_del_init(&mfn_to_page(smfn)->list);
-
- /* Decrement refcounts of all the old entries */
- xen_mappings = (!shadow2_mode_external(v->domain));
- sl4mfn = smfn;
- SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
- if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
- {
- sh2_put_ref(v, shadow_l4e_get_mfn(*sl4e),
- (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
- | ((unsigned long)sl4e & ~PAGE_MASK));
- }
- });
-
- /* Put the memory back in the pool */
- shadow2_free(v->domain, smfn);
-}
-#endif
-
-#if GUEST_PAGING_LEVELS >= 3
-void sh2_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
-{
- shadow_l3e_t *sl3e;
- u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
- mfn_t gmfn, sl3mfn;
-
- SHADOW2_DEBUG(DESTROY_SHADOW,
- "%s(%05lx)\n", __func__, mfn_x(smfn));
- ASSERT(t == PGC_SH2_l3_shadow);
-
- /* Record that the guest page isn't shadowed any more (in this type) */
- gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
- delete_shadow2_status(v, gmfn, t, smfn);
- shadow2_demote(v, gmfn, t);
-#if GUEST_PAGING_LEVELS == 3
- /* Take this shadow off the list of root shadows */
- list_del_init(&mfn_to_page(smfn)->list);
-#endif
-
- /* Decrement refcounts of all the old entries */
- sl3mfn = smfn;
- SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
- if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
- sh2_put_ref(v, shadow_l3e_get_mfn(*sl3e),
- (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
- | ((unsigned long)sl3e & ~PAGE_MASK));
- });
-
- /* Put the memory back in the pool */
- shadow2_free(v->domain, smfn);
-}
-#endif
-
-
-#if GUEST_PAGING_LEVELS == 3
-static void sh2_destroy_l3_subshadow(struct vcpu *v,
- shadow_l3e_t *sl3e)
-/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. */
-{
- int i;
- ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0);
- for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ )
- if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT )
- sh2_put_ref(v, shadow_l3e_get_mfn(sl3e[i]),
- maddr_from_mapped_domain_page(sl3e));
-}
-#endif
-
-#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
-void sh2_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn)
-/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */
-{
- int i, j;
- struct pae_l3_bookkeeping *bk;
-
- ASSERT((mfn_to_page(smfn)->count_info & PGC_SH2_type_mask)
- == PGC_SH2_l3_pae_shadow);
- /* The subshadows are split, 64 on each page of the shadow */
- for ( i = 0; i < 2; i++ )
- {
- void *p = sh2_map_domain_page(_mfn(mfn_x(smfn) + i));
- for ( j = 0; j < 64; j++ )
- {
- /* Every second 32-byte region is a bookkeeping entry */
- bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32);
- if ( bk->pinned )
- sh2_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn);
- /* Check whether we've just freed the whole shadow */
- if ( (mfn_to_page(smfn)->count_info & PGC_SH2_count_mask) == 0 )
- {
- sh2_unmap_domain_page(p);
- return;
- }
- }
- sh2_unmap_domain_page(p);
- }
-}
-#endif
-
-void sh2_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
-{
- shadow_l2e_t *sl2e;
- u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
- mfn_t gmfn, sl2mfn;
- int xen_mappings;
-
- SHADOW2_DEBUG(DESTROY_SHADOW,
- "%s(%05lx)\n", __func__, mfn_x(smfn));
- ASSERT(t == PGC_SH2_l2_shadow
- || t == PGC_SH2_l2h_pae_shadow);
-
- /* Record that the guest page isn't shadowed any more (in this type) */
- gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
- delete_shadow2_status(v, gmfn, t, smfn);
- shadow2_demote(v, gmfn, t);
-#if GUEST_PAGING_LEVELS == 2
- /* Take this shadow off the list of root shadows */
- list_del_init(&mfn_to_page(smfn)->list);
-#endif
-
- /* Decrement refcounts of all the old entries */
- sl2mfn = smfn;
- xen_mappings = (!shadow2_mode_external(v->domain) &&
- ((GUEST_PAGING_LEVELS == 2) ||
- ((GUEST_PAGING_LEVELS == 3) &&
- (t == PGC_SH2_l2h_pae_shadow))));
- SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
- if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
- sh2_put_ref(v, shadow_l2e_get_mfn(*sl2e),
- (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
- | ((unsigned long)sl2e & ~PAGE_MASK));
- });
-
- /* Put the memory back in the pool */
- shadow2_free(v->domain, smfn);
-}
-
-void sh2_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
-{
- struct domain *d = v->domain;
- shadow_l1e_t *sl1e;
- u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
-
- SHADOW2_DEBUG(DESTROY_SHADOW,
- "%s(%05lx)\n", __func__, mfn_x(smfn));
- ASSERT(t == PGC_SH2_l1_shadow || t == PGC_SH2_fl1_shadow);
-
- /* Record that the guest page isn't shadowed any more (in this type) */
- if ( t == PGC_SH2_fl1_shadow )
- {
- gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info);
- delete_fl1_shadow_status(v, gfn, smfn);
- }
- else
- {
- mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
- delete_shadow2_status(v, gmfn, t, smfn);
- shadow2_demote(v, gmfn, t);
- }
-
- if ( shadow2_mode_refcounts(d) )
- {
- /* Decrement refcounts of all the old entries */
- mfn_t sl1mfn = smfn;
- SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
- if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT )
- shadow2_put_page_from_l1e(*sl1e, d);
- });
- }
-
- /* Put the memory back in the pool */
- shadow2_free(v->domain, smfn);
-}
-
-#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
-void sh2_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
-{
- struct domain *d = v->domain;
- ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH2_type_mask)
- == PGC_SH2_monitor_table);
-
-#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
- /* Need to destroy the l3 monitor page in slot 0 too */
- {
- l4_pgentry_t *l4e = sh2_map_domain_page(mmfn);
- ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
- shadow2_free(d, _mfn(l4e_get_pfn(l4e[0])));
- sh2_unmap_domain_page(l4e);
- }
-#elif CONFIG_PAGING_LEVELS == 3
- /* Need to destroy the l2 monitor page in slot 4 too */
- {
- l3_pgentry_t *l3e = sh2_map_domain_page(mmfn);
- ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
- shadow2_free(d, _mfn(l3e_get_pfn(l3e[3])));
- sh2_unmap_domain_page(l3e);
- }
-#endif
-
- /* Put the memory back in the pool */
- shadow2_free(d, mmfn);
-}
-#endif
-
-/**************************************************************************/
-/* Functions to destroy non-Xen mappings in a pagetable hierarchy.
- * These are called from common code when we are running out of shadow
- * memory, and unpinning all the top-level shadows hasn't worked.
- *
- * This implementation is pretty crude and slow, but we hope that it won't
- * be called very often. */
-
-#if GUEST_PAGING_LEVELS == 2
-
-void sh2_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
-{
- shadow_l2e_t *sl2e;
- int xen_mappings = !shadow2_mode_external(v->domain);
- SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
- (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
- });
-}
-
-#elif GUEST_PAGING_LEVELS == 3
-
-void sh2_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn)
-/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */
-{
- shadow_l3e_t *sl3e;
- SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
- if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) {
- mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e);
- if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask)
- == PGC_SH2_l2h_pae_shadow )
- {
- /* High l2: need to pick particular l2es to unhook */
- shadow_l2e_t *sl2e;
- SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, {
- (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
- });
- }
- else
- {
- /* Normal l2: can safely unhook the whole l3e */
- (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
- }
- }
- });
- /* We've changed PAE L3 entries: must sync up various copies of them */
- sh2_pae_recopy(v->domain);
-}
-
-#elif GUEST_PAGING_LEVELS == 4
-
-void sh2_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
-{
- shadow_l4e_t *sl4e;
- int xen_mappings = !shadow2_mode_external(v->domain);
- SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
- (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
- });
-}
-
-#endif
-
-/**************************************************************************/
-/* Internal translation functions.
- * These functions require a pointer to the shadow entry that will be updated.
- */
-
-/* These functions take a new guest entry, translate it to shadow and write
- * the shadow entry.
- *
- * They return the same bitmaps as the shadow_set_lXe() functions.
- */
-
-#if GUEST_PAGING_LEVELS >= 4
-static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
-{
- shadow_l4e_t new_sl4e;
- guest_l4e_t *new_gl4e = new_ge;
- shadow_l4e_t *sl4p = se;
- mfn_t sl3mfn = _mfn(INVALID_MFN);
- int result = 0;
-
- perfc_incrc(shadow2_validate_gl4e_calls);
-
- if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
- {
- gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
- mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
- if ( valid_mfn(gl3mfn) )
- sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH2_l3_shadow);
- else
- result |= SHADOW2_SET_ERROR;
- }
- l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
- sl3mfn, &new_sl4e, ft_prefetch);
- result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
- return result;
-}
-#endif // GUEST_PAGING_LEVELS >= 4
-
-#if GUEST_PAGING_LEVELS >= 3
-static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
-{
- shadow_l3e_t new_sl3e;
- guest_l3e_t *new_gl3e = new_ge;
- shadow_l3e_t *sl3p = se;
- mfn_t sl2mfn = _mfn(INVALID_MFN);
- int result = 0;
-
- perfc_incrc(shadow2_validate_gl3e_calls);
-
- if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
- {
- gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
- mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
- if ( valid_mfn(gl2mfn) )
- sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH2_l2_shadow);
- else
- result |= SHADOW2_SET_ERROR;
- }
- l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
- sl2mfn, &new_sl3e, ft_prefetch);
- result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
-
-#if GUEST_PAGING_LEVELS == 3
- /* We have changed a PAE l3 entry: need to sync up the possible copies
- * of it */
- if ( result & SHADOW2_SET_L3PAE_RECOPY )
- sh2_pae_recopy(v->domain);
-#endif
-
- return result;
-}
-#endif // GUEST_PAGING_LEVELS >= 3
-
-static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
-{
- shadow_l2e_t new_sl2e;
- guest_l2e_t *new_gl2e = new_ge;
- shadow_l2e_t *sl2p = se;
- mfn_t sl1mfn = _mfn(INVALID_MFN);
- int result = 0;
-
- perfc_incrc(shadow2_validate_gl2e_calls);
-
- if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
- {
- gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
- if ( guest_supports_superpages(v) &&
- (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
- {
- // superpage -- need to look up the shadow L1 which holds the
- // splitters...
- sl1mfn = get_fl1_shadow_status(v, gl1gfn);
-#if 0
- // XXX - it's possible that we want to do some kind of prefetch
- // for superpage fl1's here, but this is *not* on the demand path,
- // so we'll hold off trying that for now...
- //
- if ( !valid_mfn(sl1mfn) )
- sl1mfn = make_fl1_shadow(v, gl1gfn);
-#endif
- }
- else
- {
- mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
- if ( valid_mfn(gl1mfn) )
- sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH2_l1_shadow);
- else
- result |= SHADOW2_SET_ERROR;
- }
- }
- l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
- sl1mfn, &new_sl2e, ft_prefetch);
- result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
-
- return result;
-}
-
-static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
-{
- shadow_l1e_t new_sl1e;
- guest_l1e_t *new_gl1e = new_ge;
- shadow_l1e_t *sl1p = se;
- gfn_t gfn;
- mfn_t mfn;
- int result = 0;
-
- perfc_incrc(shadow2_validate_gl1e_calls);
-
- gfn = guest_l1e_get_gfn(*new_gl1e);
- mfn = vcpu_gfn_to_mfn(v, gfn);
-
- l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e,
- /* mmio? */ !valid_mfn(mfn));
-
- result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
- return result;
-}
-
-
-/**************************************************************************/
-/* Functions which translate and install a the shadows of arbitrary guest
- * entries that we have just seen the guest write. */
-
-
-static inline int
-sh2_map_and_validate(struct vcpu *v, mfn_t gmfn,
- void *new_gp, u32 size, u32 sh_type,
- u32 (*shadow_index)(mfn_t *smfn, u32 idx),
- int (*validate_ge)(struct vcpu *v, void *ge,
- mfn_t smfn, void *se))
-/* Generic function for mapping and validating. */
-{
- mfn_t smfn, smfn2, map_mfn;
- shadow_l1e_t *sl1p;
- u32 shadow_idx, guest_idx;
- int result = 0;
-
- /* Align address and size to guest entry boundaries */
- size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
- new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
- size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
- ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
-
- /* Map the shadow page */
- smfn = get_shadow_status(v, gmfn, sh_type);
- ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */
- guest_idx = guest_index(new_gp);
- map_mfn = smfn;
- shadow_idx = shadow_index(&map_mfn, guest_idx);
- sl1p = map_shadow_page(map_mfn);
-
- /* Validate one entry at a time */
- while ( size )
- {
- smfn2 = smfn;
- guest_idx = guest_index(new_gp);
- shadow_idx = shadow_index(&smfn2, guest_idx);
- if ( mfn_x(smfn2) != mfn_x(map_mfn) )
- {
- /* We have moved to another page of the shadow */
- map_mfn = smfn2;
- unmap_shadow_page(sl1p);
- sl1p = map_shadow_page(map_mfn);
- }
- result |= validate_ge(v,
- new_gp,
- map_mfn,
- &sl1p[shadow_idx]);
- size -= sizeof(guest_l1e_t);
- new_gp += sizeof(guest_l1e_t);
- }
- unmap_shadow_page(sl1p);
- return result;
-}
-
-
-int
-sh2_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
- void *new_gl4p, u32 size)
-{
-#if GUEST_PAGING_LEVELS >= 4
- return sh2_map_and_validate(v, gl4mfn, new_gl4p, size,
- PGC_SH2_l4_shadow,
- shadow_l4_index,
- validate_gl4e);
-#else // ! GUEST_PAGING_LEVELS >= 4
- SHADOW2_PRINTK("called in wrong paging mode!\n");
- BUG();
- return 0;
-#endif
-}
-
-int
-sh2_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
- void *new_gl3p, u32 size)
-{
-#if GUEST_PAGING_LEVELS >= 3
- return sh2_map_and_validate(v, gl3mfn, new_gl3p, size,
- PGC_SH2_l3_shadow,
- shadow_l3_index,
- validate_gl3e);
-#else // ! GUEST_PAGING_LEVELS >= 3
- SHADOW2_PRINTK("called in wrong paging mode!\n");
- BUG();
- return 0;
-#endif
-}
-
-int
-sh2_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
- void *new_gl2p, u32 size)
-{
- return sh2_map_and_validate(v, gl2mfn, new_gl2p, size,
- PGC_SH2_l2_shadow,
- shadow_l2_index,
- validate_gl2e);
-}
-
-int
-sh2_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
- void *new_gl2p, u32 size)
-{
-#if GUEST_PAGING_LEVELS == 3
- return sh2_map_and_validate(v, gl2mfn, new_gl2p, size,
- PGC_SH2_l2h_shadow,
- shadow_l2_index,
- validate_gl2e);
-#else /* Non-PAE guests don't have different kinds of l2 table */
- SHADOW2_PRINTK("called in wrong paging mode!\n");
- BUG();
- return 0;
-#endif
-}
-
-int
-sh2_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
- void *new_gl1p, u32 size)
-{
- return sh2_map_and_validate(v, gl1mfn, new_gl1p, size,
- PGC_SH2_l1_shadow,
- shadow_l1_index,
- validate_gl1e);
-}
-
-
-/**************************************************************************/
-/* Optimization: If we see two emulated writes of zeros to the same
- * page-table without another kind of page fault in between, we guess
- * that this is a batch of changes (for process destruction) and
- * unshadow the page so we don't take a pagefault on every entry. This
- * should also make finding writeable mappings of pagetables much
- * easier. */
-
-/* Look to see if this is the second emulated write in a row to this
- * page, and unshadow/unhook if it is */
-static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
-{
-#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
- if ( v->arch.shadow2.last_emulated_mfn == mfn_x(gmfn) &&
- sh2_mfn_is_a_page_table(gmfn) )
- {
- u32 flags = mfn_to_page(gmfn)->shadow2_flags;
- mfn_t smfn;
- if ( !(flags & (SH2F_L2_32|SH2F_L3_PAE|SH2F_L4_64)) )
- {
- perfc_incrc(shadow2_early_unshadow);
- sh2_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ );
- return;
- }
- /* SH2F_unhooked_mappings is set to make sure we only unhook
- * once in a single batch of updates. It is reset when this
- * top-level page is loaded into CR3 again */
- if ( !(flags & SH2F_unhooked_mappings) )
- {
- perfc_incrc(shadow2_early_unshadow_top);
- mfn_to_page(gmfn)->shadow2_flags |= SH2F_unhooked_mappings;
- if ( flags & SH2F_L2_32 )
- {
- smfn = get_shadow_status(v, gmfn, PGC_SH2_l2_32_shadow);
- shadow2_unhook_mappings(v, smfn);
- }
- if ( flags & SH2F_L3_PAE )
- {
- smfn = get_shadow_status(v, gmfn, PGC_SH2_l3_pae_shadow);
- shadow2_unhook_mappings(v, smfn);
- }
- if ( flags & SH2F_L4_64 )
- {
- smfn = get_shadow_status(v, gmfn, PGC_SH2_l4_64_shadow);
- shadow2_unhook_mappings(v, smfn);
- }
- }
- }
- v->arch.shadow2.last_emulated_mfn = mfn_x(gmfn);
-#endif
-}
-
-/* Stop counting towards early unshadows, as we've seen a real page fault */
-static inline void reset_early_unshadow(struct vcpu *v)
-{
-#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
- v->arch.shadow2.last_emulated_mfn = INVALID_MFN;
-#endif
-}
-
-
-
-/**************************************************************************/
-/* Entry points into the shadow code */
-
-/* Called from pagefault handler in Xen, and from the HVM trap handlers
- * for pagefaults. Returns 1 if this fault was an artefact of the
- * shadow code (and the guest should retry) or 0 if it is not (and the
- * fault should be handled elsewhere or passed to the guest). */
-
-static int sh2_page_fault(struct vcpu *v,
- unsigned long va,
- struct cpu_user_regs *regs)
-{
- struct domain *d = v->domain;
- walk_t gw;
- u32 accumulated_gflags;
- gfn_t gfn;
- mfn_t gmfn, sl1mfn=_mfn(0);
- shadow_l1e_t sl1e, *ptr_sl1e;
- paddr_t gpa;
- struct cpu_user_regs emul_regs;
- struct x86_emulate_ctxt emul_ctxt;
- int r, mmio;
- fetch_type_t ft = 0;
-
- //
- // XXX: Need to think about eventually mapping superpages directly in the
- // shadow (when possible), as opposed to splintering them into a
- // bunch of 4K maps.
- //
-
- SHADOW2_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
- v->domain->domain_id, v->vcpu_id, va, regs->error_code);
-
- shadow2_lock(d);
-
- shadow2_audit_tables(v);
-
- if ( guest_walk_tables(v, va, &gw, 1) != 0 )
- {
- SHADOW2_PRINTK("malformed guest pagetable!");
- print_gw(&gw);
- }
-
- sh2_audit_gw(v, &gw);
-
- // We do not look at the gw->l1e, as that will not exist for superpages.
- // Instead, we use the gw->eff_l1e...
- //
- // We need not check all the levels of the guest page table entries for
- // present vs not-present, as the eff_l1e will always be not present if
- // one of the higher level entries is not present.
- //
- if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
- {
- if ( hvm_guest(v) && !shadow2_vcpu_mode_translate(v) )
- {
- /* Not present in p2m map, means this is mmio */
- gpa = va;
- goto mmio;
- }
-
- perfc_incrc(shadow2_fault_bail_not_present);
- goto not_a_shadow_fault;
- }
-
- // All levels of the guest page table are now known to be present.
- accumulated_gflags = accumulate_guest_flags(&gw);
-
- // Check for attempts to access supervisor-only pages from user mode,
- // i.e. ring 3. Such errors are not caused or dealt with by the shadow
- // code.
- //
- if ( (regs->error_code & PFEC_user_mode) &&
- !(accumulated_gflags & _PAGE_USER) )
- {
- /* illegal user-mode access to supervisor-only page */
- perfc_incrc(shadow2_fault_bail_user_supervisor);
- goto not_a_shadow_fault;
- }
-
- // Was it a write fault?
- //
- if ( regs->error_code & PFEC_write_access )
- {
- if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
- {
- perfc_incrc(shadow2_fault_bail_ro_mapping);
- goto not_a_shadow_fault;
- }
- }
- else // must have been either an insn fetch or read fault
- {
- // Check for NX bit violations: attempts to execute code that is
- // marked "do not execute". Such errors are not caused or dealt with
- // by the shadow code.
- //
- if ( regs->error_code & PFEC_insn_fetch )
- {
- if ( accumulated_gflags & _PAGE_NX_BIT )
- {
- /* NX prevented this code fetch */
- perfc_incrc(shadow2_fault_bail_nx);
- goto not_a_shadow_fault;
- }
- }
- }
-
- /* Is this an MMIO access? */
- gfn = guest_l1e_get_gfn(gw.eff_l1e);
- mmio = ( hvm_guest(v)
- && shadow2_vcpu_mode_translate(v)
- && mmio_space(gfn_to_paddr(gfn)) );
-
- /* For MMIO, the shadow holds the *gfn*; for normal accesses, if holds
- * the equivalent mfn. */
- if ( mmio )
- gmfn = _mfn(gfn_x(gfn));
- else
- {
- gmfn = vcpu_gfn_to_mfn(v, gfn);
- if ( !valid_mfn(gmfn) )
- {
- perfc_incrc(shadow2_fault_bail_bad_gfn);
- SHADOW2_PRINTK("BAD gfn=%"SH2_PRI_gfn" gmfn=%"SH2_PRI_mfn"\n",
- gfn_x(gfn), mfn_x(gmfn));
- goto not_a_shadow_fault;
- }
- }
-
- /* Make sure there is enough free shadow memory to build a chain of
- * shadow tables: one SHADOW2_MAX_ORDER chunk will always be enough
- * to allocate all we need. (We never allocate a top-level shadow
- * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
- shadow2_prealloc(d, SHADOW2_MAX_ORDER);
-
- /* Acquire the shadow. This must happen before we figure out the rights
- * for the shadow entry, since we might promote a page here. */
- // XXX -- this code will need to change somewhat if/when the shadow code
- // can directly map superpages...
- ft = ((regs->error_code & PFEC_write_access) ?
- ft_demand_write : ft_demand_read);
- ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
- ASSERT(ptr_sl1e);
-
- /* Calculate the shadow entry */
- if ( ft == ft_demand_write )
- {
- if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) )
- {
- perfc_incrc(shadow2_fault_emulate_write);
- goto emulate;
- }
- }
- else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) )
- {
- perfc_incrc(shadow2_fault_emulate_read);
- goto emulate;
- }
-
- /* Quick sanity check: we never make an MMIO entry that's got the
- * _PAGE_PRESENT flag set in it. */
- ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT));
-
- r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
-
- if ( mmio )
- {
- gpa = guest_walk_to_gpa(&gw);
- goto mmio;
- }
-
-#if 0
- if ( !(r & SHADOW2_SET_CHANGED) )
- debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH2_PRI_pte
- ") did not change anything\n",
- __func__, gw.va, l1e_get_intpte(sl1e));
-#endif
-
- perfc_incrc(shadow2_fault_fixed);
- d->arch.shadow2.fault_count++;
- reset_early_unshadow(v);
-
- done:
- sh2_audit_gw(v, &gw);
- unmap_walk(v, &gw);
- SHADOW2_PRINTK("fixed\n");
- shadow2_audit_tables(v);
- shadow2_unlock(d);
- return EXCRET_fault_fixed;
-
- emulate:
-
- /* Take the register set we were called with */
- emul_regs = *regs;
- if ( hvm_guest(v) )
- {
- /* Add the guest's segment selectors, rip, rsp. rflags */
- hvm_store_cpu_guest_regs(v, &emul_regs, NULL);
- }
- emul_ctxt.regs = &emul_regs;
- emul_ctxt.cr2 = va;
- emul_ctxt.mode = hvm_guest(v) ? hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST;
-
- SHADOW2_PRINTK("emulate: eip=%#lx\n", emul_regs.eip);
-
- v->arch.shadow2.propagate_fault = 0;
- if ( x86_emulate_memop(&emul_ctxt, &shadow2_emulator_ops) )
- {
- SHADOW2_PRINTK("emulator failure, unshadowing mfn %#lx\n",
- mfn_x(gmfn));
- perfc_incrc(shadow2_fault_emulate_failed);
- /* If this is actually a page table, then we have a bug, and need
- * to support more operations in the emulator. More likely,
- * though, this is a hint that this page should not be shadowed. */
- shadow2_remove_all_shadows(v, gmfn);
- /* This means that actual missing operations will cause the
- * guest to loop on the same page fault. */
- goto done;
- }
- if ( v->arch.shadow2.propagate_fault )
- {
- /* Emulation triggered another page fault */
- goto not_a_shadow_fault;
- }
-
- /* Emulator has changed the user registers: write back */
- if ( hvm_guest(v) )
- {
- /* Write back the guest's segment selectors, rip, rsp. rflags */
- hvm_load_cpu_guest_regs(v, &emul_regs);
- /* And don't overwrite those in the caller's regs. */
- emul_regs.eip = regs->eip;
- emul_regs.cs = regs->cs;
- emul_regs.eflags = regs->eflags;
- emul_regs.esp = regs->esp;
- emul_regs.ss = regs->ss;
- emul_regs.es = regs->es;
- emul_regs.ds = regs->ds;
- emul_regs.fs = regs->fs;
- emul_regs.gs = regs->gs;
- }
- *regs = emul_regs;
-
- goto done;
-
- mmio:
- perfc_incrc(shadow2_fault_mmio);
- if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) )
- {
- /* Need to deal with these disabled-APIC accesses, as
- * handle_mmio() apparently does not currently do that. */
- /* TJD: What about it, then? For now, I'm turning this BUG()
- * into a domain_crash() since we don't want to kill Xen. */
- SHADOW2_ERROR("disabled-APIC access: not supported\n.");
- domain_crash(d);
- }
- sh2_audit_gw(v, &gw);
- unmap_walk(v, &gw);
- SHADOW2_PRINTK("mmio\n");
- shadow2_audit_tables(v);
- reset_early_unshadow(v);
- shadow2_unlock(d);
- sh2_log_mmio(v, gpa);
- handle_mmio(va, gpa);
- return EXCRET_fault_fixed;
-
- not_a_shadow_fault:
- sh2_audit_gw(v, &gw);
- unmap_walk(v, &gw);
- SHADOW2_PRINTK("not a shadow fault\n");
- shadow2_audit_tables(v);
- reset_early_unshadow(v);
- shadow2_unlock(d);
- return 0;
-}
-
-
-static int
-sh2_invlpg(struct vcpu *v, unsigned long va)
-/* Called when the guest requests an invlpg. Returns 1 if the invlpg
- * instruction should be issued on the hardware, or 0 if it's safe not
- * to do so. */
-{
- shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va);
-
- // XXX -- might be a good thing to prefetch the va into the shadow
-
- // no need to flush anything if there's no SL2...
- //
- if ( !ptr_sl2e )
- return 0;
-
- // If there's nothing shadowed for this particular sl2e, then
- // there is no need to do an invlpg, either...
- //
- if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) )
- return 0;
-
- // Check to see if the SL2 is a splintered superpage...
- // If so, then we'll need to flush the entire TLB (because that's
- // easier than invalidating all of the individual 4K pages).
- //
- if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info &
- PGC_SH2_type_mask) == PGC_SH2_fl1_shadow )
- {
- local_flush_tlb();
- return 0;
- }
-
- return 1;
-}
-
-static unsigned long
-sh2_gva_to_gfn(struct vcpu *v, unsigned long va)
-/* Called to translate a guest virtual address to what the *guest*
- * pagetables would map it to. */
-{
- walk_t gw;
- gfn_t gfn;
-
- guest_walk_tables(v, va, &gw, 0);
- gfn = guest_walk_to_gfn(&gw);
- unmap_walk(v, &gw);
-
- return gfn_x(gfn);
-}
-
-
-static unsigned long
-sh2_gva_to_gpa(struct vcpu *v, unsigned long va)
-/* Called to translate a guest virtual address to what the *guest*
- * pagetables would map it to. */
-{
- unsigned long gfn = sh2_gva_to_gfn(v, va);
- if ( gfn == INVALID_GFN )
- return 0;
- else
- return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
-}
-
-
-// XXX -- should this be in this file?
-// Or should it be moved to shadow2-common.c?
-//
-/* returns a lowmem machine address of the copied HVM L3 root table
- * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy,
- * otherwise blank out any entries with reserved bits in them. */
-#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
-static unsigned long
-hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res)
-{
- int i, f;
- int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY);
- l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
- memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t));
- for ( i = 0; i < 4; i++ )
- {
- f = l3e_get_flags(l3tab[i]);
- if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) )
- new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res);
- else
- new_l3e = l3e_empty();
- safe_write_entry(©[i], &new_l3e);
- }
- return __pa(copy);
-}
-#endif
-
-
-static inline void
-sh2_update_linear_entries(struct vcpu *v)
-/* Sync up all the linear mappings for this vcpu's pagetables */
-{
- struct domain *d = v->domain;
-
- /* Linear pagetables in PV guests
- * ------------------------------
- *
- * Guest linear pagetables, which map the guest pages, are at
- * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
- * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
- * are set up at shadow creation time, but (of course!) the PAE case
- * is subtler. Normal linear mappings are made by having an entry
- * in the top-level table that points to itself (shadow linear) or
- * to the guest top-level table (guest linear). For PAE, to set up
- * a linear map requires us to copy the four top-level entries into
- * level-2 entries. That means that every time we change a PAE l3e,
- * we need to reflect the change into the copy.
- *
- * Linear pagetables in HVM guests
- * -------------------------------
- *
- * For HVM guests, the linear pagetables are installed in the monitor
- * tables (since we can't put them in the shadow). Shadow linear
- * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
- * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
- * a linear pagetable of the monitor tables themselves. We have
- * the same issue of having to re-copy PAE l3 entries whevever we use
- * PAE shadows.
- *
- * Because HVM guests run on the same monitor tables regardless of the
- * shadow tables in use, the linear mapping of the shadow tables has to
- * be updated every time v->arch.shadow_table changes.
- */
-
- /* Don't try to update the monitor table if it doesn't exist */
- if ( shadow2_mode_external(d)
- && pagetable_get_pfn(v->arch.monitor_table) == 0 )
- return;
-
-#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
-
- /* For PV, one l4e points at the guest l4, one points at the shadow
- * l4. No maintenance required.
- * For HVM, just need to update the l4e that points to the shadow l4. */
-
- if ( shadow2_mode_external(d) )
- {
- /* Use the linear map if we can; otherwise make a new mapping */
- if ( v == current )
- {
- __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
- l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
- __PAGE_HYPERVISOR);
- }
- else
- {
- l4_pgentry_t *ml4e;
- ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
- ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
- l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
- __PAGE_HYPERVISOR);
- sh2_unmap_domain_page(ml4e);
- }
- }
-
-#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
-
- /* This case only exists in HVM. To give ourselves a linear map of the
- * shadows, we need to extend a PAE shadow to 4 levels. We do this by
- * having a monitor l3 in slot 0 of the monitor l4 table, and
- * copying the PAE l3 entries into it. Then, by having the monitor l4e
- * for shadow pagetables also point to the monitor l4, we can use it
- * to access the shadows. */
-
- if ( shadow2_mode_external(d) )
- {
- /* Install copies of the shadow l3es into the monitor l3 table.
- * The monitor l3 table is hooked into slot 0 of the monitor
- * l4 table, so we use l3 linear indices 0 to 3 */
- shadow_l3e_t *sl3e;
- l3_pgentry_t *ml3e;
- mfn_t l3mfn;
- int i;
-
- /* Use linear mappings if we can; otherwise make new mappings */
- if ( v == current )
- {
- ml3e = __linear_l3_table;
- l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
-#if GUEST_PAGING_LEVELS == 2
- /* Shadow l3 tables are made up by update_cr3 */
- sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
-#else
- sl3e = v->arch.shadow_vtable;
-#endif
- }
- else
- {
- l4_pgentry_t *ml4e;
- ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
- ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
- l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
- ml3e = sh2_map_domain_page(l3mfn);
- sh2_unmap_domain_page(ml4e);
-#if GUEST_PAGING_LEVELS == 2
- /* Shadow l3 tables are made up by update_cr3 */
- sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
-#else
- sl3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.shadow_table));
-#endif
- }
-
- for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
- {
- ml3e[i] =
- (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
- ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
- __PAGE_HYPERVISOR)
- : l3e_empty();
- }
-
- if ( v != current )
- {
- sh2_unmap_domain_page(ml3e);
-#if GUEST_PAGING_LEVELS != 2
- sh2_unmap_domain_page(sl3e);
-#endif
- }
- }
-
-#elif CONFIG_PAGING_LEVELS == 3
-
- /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
- * entries in the shadow, and the shadow's l3 entries into the
- * shadow-linear-map l2 entries in the shadow. This is safe to do
- * because Xen does not let guests share high-slot l2 tables between l3s,
- * so we know we're not treading on anyone's toes.
- *
- * HVM: need to copy the shadow's l3 entries into the
- * shadow-linear-map l2 entries in the monitor table. This is safe
- * because we have one monitor table for each vcpu. The monitor's
- * own l3es don't need to be copied because they never change.
- * XXX That might change if we start stuffing things into the rest
- * of the monitor's virtual address space.
- */
- {
- l2_pgentry_t *l2e, new_l2e;
- shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
- int i;
-
-#if GUEST_PAGING_LEVELS == 2
- /* Shadow l3 tables were built by update_cr3 */
- if ( shadow2_mode_external(d) )
- shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
- else
- BUG(); /* PV 2-on-3 is not supported yet */
-
-#else /* GUEST_PAGING_LEVELS == 3 */
-
- /* Use local vcpu's mappings if we can; otherwise make new mappings */
- if ( v == current )
- {
- shadow_l3e = v->arch.shadow_vtable;
- if ( !shadow2_mode_external(d) )
- guest_l3e = v->arch.guest_vtable;
- }
- else
- {
- mfn_t smfn;
- int idx;
-
- /* Map the shadow l3 */
- smfn = pagetable_get_mfn(v->arch.shadow_table);
- idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable));
- shadow_l3e = sh2_map_domain_page(smfn);
- shadow_l3e += idx;
- if ( !shadow2_mode_external(d) )
- {
- /* Also the guest l3 */
- mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table);
- guest_l3e = sh2_map_domain_page(gmfn);
- guest_l3e += guest_index(v->arch.guest_vtable);
- }
- }
-#endif /* GUEST_PAGING_LEVELS */
-
- /* Choose where to write the entries, using linear maps if possible */
- if ( v == current && shadow2_mode_external(d) )
- {
- /* From the monitor tables, it's safe to use linear maps to update
- * monitor l2s */
- l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
- }
- else if ( shadow2_mode_external(d) )
- {
- /* Map the monitor table's high l2 */
- l3_pgentry_t *l3e;
- l3e = sh2_map_domain_page(
- pagetable_get_mfn(v->arch.monitor_table));
- ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
- l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
- sh2_unmap_domain_page(l3e);
- }
- else
- {
- /* Map the shadow table's high l2 */
- ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
- l2e = sh2_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
- }
-
-
- if ( !shadow2_mode_external(d) )
- {
- /* Write linear mapping of guest. */
- for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
- {
- new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
- ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
- __PAGE_HYPERVISOR)
- : l2e_empty();
- safe_write_entry(
- &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
- &new_l2e);
- }
- }
-
- /* Write linear mapping of shadow. */
- for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
- {
- new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
- ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
- __PAGE_HYPERVISOR)
- : l2e_empty();
- safe_write_entry(
- &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
- &new_l2e);
- }
-
- if ( v != current || !shadow2_mode_external(d) )
- sh2_unmap_domain_page(l2e);
-
-#if GUEST_PAGING_LEVELS == 3
- if ( v != current)
- {
- sh2_unmap_domain_page(shadow_l3e);
- if ( !shadow2_mode_external(d) )
- sh2_unmap_domain_page(guest_l3e);
- }
-#endif
- }
-
-#elif CONFIG_PAGING_LEVELS == 2
-
- /* For PV, one l2e points at the guest l2, one points at the shadow
- * l2. No maintenance required.
- * For HVM, just need to update the l2e that points to the shadow l2. */
-
- if ( shadow2_mode_external(d) )
- {
- /* Use the linear map if we can; otherwise make a new mapping */
- if ( v == current )
- {
- __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
- __PAGE_HYPERVISOR);
- }
- else
- {
- l2_pgentry_t *ml2e;
- ml2e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
- ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
- __PAGE_HYPERVISOR);
- sh2_unmap_domain_page(ml2e);
- }
- }
-
-#else
-#error this should not happen
-#endif
-}
-
-
-// XXX -- should this be in this file?
-// Or should it be moved to shadow2-common.c?
-//
-#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
-void sh2_pae_recopy(struct domain *d)
-/* Called whenever we write to the l3 entries of a PAE pagetable which
- * is currently in use. Each vcpu that is using the table needs to
- * resync its copies of the l3s in linear maps and any low-memory
- * copies it might have made for fitting into 32bit CR3.
- * Since linear maps are also resynced when we change CR3, we don't
- * need to worry about changes to PAE l3es that are not currently in use.*/
-{
- struct vcpu *v;
- cpumask_t flush_mask = CPU_MASK_NONE;
- ASSERT(shadow2_lock_is_acquired(d));
-
- for_each_vcpu(d, v)
- {
- if ( !v->arch.shadow2.pae_flip_pending )
- continue;
-
- cpu_set(v->processor, flush_mask);
-
- SHADOW2_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id);
-
- /* This vcpu has a copy in its linear maps */
- sh2_update_linear_entries(v);
- if ( hvm_guest(v) )
- {
- /* This vcpu has a copy in its HVM PAE l3 */
- v->arch.hvm_vcpu.hw_cr3 =
- hvm_pae_copy_root(v, v->arch.shadow_vtable,
- !shadow2_vcpu_mode_translate(v));
- }
-#if CONFIG_PAGING_LEVELS == 3
- else
- {
- /* This vcpu might have copied the l3 to below 4GB */
- if ( v->arch.cr3 >> PAGE_SHIFT
- != pagetable_get_pfn(v->arch.shadow_table) )
- {
- /* Recopy to where that copy is. */
- int i;
- l3_pgentry_t *dst, *src;
- dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */
- src = v->arch.shadow_vtable;
- for ( i = 0 ; i < 4 ; i++ )
- safe_write_entry(dst + i, src + i);
- }
- }
-#endif
- v->arch.shadow2.pae_flip_pending = 0;
- }
-
- flush_tlb_mask(flush_mask);
-}
-#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */
-
-
-/* removes:
- * vcpu->arch.guest_vtable
- * vcpu->arch.shadow_table
- * vcpu->arch.shadow_vtable
- * Does all appropriate management/bookkeeping/refcounting/etc...
- */
-static void
-sh2_detach_old_tables(struct vcpu *v)
-{
- mfn_t smfn;
-
- ////
- //// vcpu->arch.guest_vtable
- ////
- if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
- v->arch.guest_vtable )
- {
- // Q: why does this need to use (un)map_domain_page_*global* ?
- sh2_unmap_domain_page_global(v->arch.guest_vtable);
- v->arch.guest_vtable = NULL;
- }
-
- ////
- //// vcpu->arch.shadow_table
- ////
- smfn = pagetable_get_mfn(v->arch.shadow_table);
- if ( mfn_x(smfn) )
- {
- ASSERT(v->arch.shadow_vtable);
-
-#if GUEST_PAGING_LEVELS == 3
- // PAE guests do not (necessarily) use an entire page for their
- // 4-entry L3s, so we have to deal with them specially.
- //
- sh2_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn);
-#else
- sh2_put_ref(v, smfn, 0);
-#endif
-
-#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
- {
- struct pae_l3_bookkeeping *info =
- sl3p_to_info(v->arch.shadow_vtable);
- ASSERT(test_bit(v->vcpu_id, &info->vcpus));
- clear_bit(v->vcpu_id, &info->vcpus);
- }
-#endif
- v->arch.shadow_table = pagetable_null();
- }
-
- ////
- //// vcpu->arch.shadow_vtable
- ////
- if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
- v->arch.shadow_vtable )
- {
- // Q: why does this need to use (un)map_domain_page_*global* ?
- //
- sh2_unmap_domain_page_global(v->arch.shadow_vtable);
- v->arch.shadow_vtable = NULL;
- }
-}
-
-static void
-sh2_update_cr3(struct vcpu *v)
-/* Updates vcpu->arch.shadow_table after the guest has changed CR3.
- * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
- * if appropriate).
- * HVM guests should also set hvm_get_guest_cntl_reg(v, 3)...
- */
-{
- struct domain *d = v->domain;
- mfn_t gmfn, smfn;
-#if GUEST_PAGING_LEVELS == 3
- u32 guest_idx=0;
-#endif
-
- ASSERT(shadow2_lock_is_acquired(v->domain));
- ASSERT(v->arch.shadow2.mode);
-
- ////
- //// vcpu->arch.guest_table is already set
- ////
-
-#ifndef NDEBUG
- /* Double-check that the HVM code has sent us a sane guest_table */
- if ( hvm_guest(v) )
- {
- gfn_t gfn;
-
- ASSERT(shadow2_mode_external(d));
-
- // Is paging enabled on this vcpu?
- if ( shadow2_vcpu_mode_translate(v) )
- {
- gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
- gmfn = vcpu_gfn_to_mfn(v, gfn);
- ASSERT(valid_mfn(gmfn));
- ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
- }
- else
- {
- /* Paging disabled: guest_table points at (part of) p2m */
-#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
- /* For everything else, they sould be the same */
- ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
-#endif
- }
- }
-#endif
-
- SHADOW2_PRINTK("d=%u v=%u guest_table=%05lx\n",
- d->domain_id, v->vcpu_id,
- (unsigned long)pagetable_get_pfn(v->arch.guest_table));
-
-#if GUEST_PAGING_LEVELS == 4
- if ( !(v->arch.flags & TF_kernel_mode) )
- gmfn = pagetable_get_mfn(v->arch.guest_table_user);
- else
-#endif
- gmfn = pagetable_get_mfn(v->arch.guest_table);
-
- sh2_detach_old_tables(v);
-
- if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
- {
- ASSERT(v->arch.cr3 == 0);
- return;
- }
-
- ////
- //// vcpu->arch.guest_vtable
- ////
- if ( shadow2_mode_external(d) )
- {
-#if GUEST_PAGING_LEVELS == 3
- if ( shadow2_vcpu_mode_translate(v) )
- /* Paging enabled: find where in the page the l3 table is */
- guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
- else
- /* Paging disabled: l3 is at the start of a page (in the p2m) */
- guest_idx = 0;
-
- // Ignore the low 2 bits of guest_idx -- they are really just
- // cache control.
- guest_idx &= ~3;
- // XXX - why does this need a global map?
- v->arch.guest_vtable =
- (guest_l3e_t *)sh2_map_domain_page_global(gmfn) + guest_idx;
-#else
- // XXX - why does this need a global map?
- v->arch.guest_vtable = sh2_map_domain_page_global(gmfn);
-#endif
- }
- else
- {
-#ifdef __x86_64__
- v->arch.guest_vtable = __linear_l4_table;
-#elif GUEST_PAGING_LEVELS == 3
- // XXX - why does this need a global map?
- v->arch.guest_vtable = sh2_map_domain_page_global(gmfn);
-#else
- v->arch.guest_vtable = __linear_l2_table;
-#endif
- }
-
-#if 0
- printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
- __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
-#endif
-
- ////
- //// vcpu->arch.shadow_table
- ////
- smfn = get_shadow_status(v, gmfn, PGC_SH2_guest_root_type);
- if ( valid_mfn(smfn) )
- {
- /* Pull this root shadow to the front of the list of roots. */
- list_del(&mfn_to_page(smfn)->list);
- list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2.toplevel_shadows);
- }
- else
- {
- /* This guest MFN is a pagetable. Must revoke write access. */
- if ( shadow2_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0)
- != 0 )
- flush_tlb_mask(d->domain_dirty_cpumask);
- /* Make sure there's enough free shadow memory. */
- shadow2_prealloc(d, SHADOW2_MAX_ORDER);
- /* Shadow the page. */
- smfn = sh2_make_shadow(v, gmfn, PGC_SH2_guest_root_type);
- list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2.toplevel_shadows);
- }
- ASSERT(valid_mfn(smfn));
- v->arch.shadow_table = pagetable_from_mfn(smfn);
-
-#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
- /* Once again OK to unhook entries from this table if we see fork/exit */
- ASSERT(sh2_mfn_is_a_page_table(gmfn));
- mfn_to_page(gmfn)->shadow2_flags &= ~SH2F_unhooked_mappings;
-#endif
-
-
- ////
- //// vcpu->arch.shadow_vtable
- ////
- if ( shadow2_mode_external(d) )
- {
-#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
- mfn_t adjusted_smfn = smfn;
- u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx);
- // Q: why does this need to use (un)map_domain_page_*global* ?
- v->arch.shadow_vtable =
- (shadow_l3e_t *)sh2_map_domain_page_global(adjusted_smfn) +
- shadow_idx;
-#else
- // Q: why does this need to use (un)map_domain_page_*global* ?
- v->arch.shadow_vtable = sh2_map_domain_page_global(smfn);
-#endif
- }
- else
- {
-#if SHADOW_PAGING_LEVELS == 4
- v->arch.shadow_vtable = __sh2_linear_l4_table;
-#elif GUEST_PAGING_LEVELS == 3
- // XXX - why does this need a global map?
- v->arch.shadow_vtable = sh2_map_domain_page_global(smfn);
-#else
- v->arch.shadow_vtable = __sh2_linear_l2_table;
-#endif
- }
-
- ////
- //// Take a ref to the new shadow table, and pin it.
- ////
- //
- // This ref is logically "held" by v->arch.shadow_table entry itself.
- // Release the old ref.
- //
-#if GUEST_PAGING_LEVELS == 3
- // PAE guests do not (necessarily) use an entire page for their
- // 4-entry L3s, so we have to deal with them specially.
- //
- // XXX - might want to revisit this if/when we do multiple compilation for
- // HVM-vs-PV guests, as PAE PV guests could get away without doing
- // subshadows.
- //
- sh2_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn);
- sh2_pin_l3_subshadow(v->arch.shadow_vtable, smfn);
-#else
- sh2_get_ref(smfn, 0);
- sh2_pin(smfn);
-#endif
-
-#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
- // PAE 3-on-3 shadows have to keep track of which vcpu's are using
- // which l3 subshadow, in order handle the SHADOW2_SET_L3PAE_RECOPY
- // case from validate_gl3e(). Search for SHADOW2_SET_L3PAE_RECOPY
- // in the code for more info.
- //
- {
- struct pae_l3_bookkeeping *info =
- sl3p_to_info(v->arch.shadow_vtable);
- ASSERT(!test_bit(v->vcpu_id, &info->vcpus));
- set_bit(v->vcpu_id, &info->vcpus);
- }
-#endif
-
- debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n",
- __func__, gmfn, smfn);
-
- ///
- /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3
- ///
- if ( shadow2_mode_external(d) )
- {
- ASSERT(hvm_guest(v));
- make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
-
-#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
-#if SHADOW_PAGING_LEVELS != 3
-#error unexpected combination of GUEST and SHADOW paging levels
-#endif
- /* 2-on-3: make a PAE l3 table that points at the four-page l2 */
- {
- mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table);
- int i;
-
- ASSERT(v->arch.hvm_vcpu.hw_cr3 ==
- virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab));
- for (i = 0; i < 4; i++)
- {
- v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] =
- shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT);
- }
- }
-#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
- /* 3-on-3: copy the shadow l3 to slots that are below 4GB.
- * If paging is disabled, clear l3e reserved bits; otherwise
- * remove entries that have reserved bits set. */
- v->arch.hvm_vcpu.hw_cr3 =
- hvm_pae_copy_root(v, v->arch.shadow_vtable,
- !shadow2_vcpu_mode_translate(v));
-#else
- /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */
- v->arch.hvm_vcpu.hw_cr3 =
- pagetable_get_paddr(v->arch.shadow_table);
-#endif
- }
- else // not shadow2_mode_external...
- {
- /* We don't support PV except guest == shadow == config levels */
- BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
- make_cr3(v, pagetable_get_pfn(v->arch.shadow_table));
- }
-
- /* Fix up the linear pagetable mappings */
- sh2_update_linear_entries(v);
-}
-
-
-/**************************************************************************/
-/* Functions to revoke guest rights */
-
-#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
-static int sh2_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
-/* Look up this vaddr in the current shadow and see if it's a writeable
- * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
-{
- shadow_l1e_t sl1e, *sl1p;
- shadow_l2e_t *sl2p;
-#if GUEST_PAGING_LEVELS >= 3
- shadow_l3e_t *sl3p;
-#if GUEST_PAGING_LEVELS >= 4
- shadow_l4e_t *sl4p;
-#endif
-#endif
- mfn_t sl1mfn;
-
-
- /* Carefully look in the shadow linear map for the l1e we expect */
- if ( v->arch.shadow_vtable == NULL ) return 0;
-#if GUEST_PAGING_LEVELS >= 4
- sl4p = sh2_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
- if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
- return 0;
- sl3p = sh2_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
- if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
- return 0;
-#elif GUEST_PAGING_LEVELS == 3
- sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable)
- + shadow_l3_linear_offset(vaddr);
- if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
- return 0;
-#endif
- sl2p = sh2_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
- if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
- return 0;
- sl1p = sh2_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
- sl1e = *sl1p;
- if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
- != (_PAGE_PRESENT|_PAGE_RW))
- || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
- return 0;
-
- /* Found it! Need to remove its write permissions. */
- sl1mfn = shadow_l2e_get_mfn(*sl2p);
- sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
- shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
- return 1;
-}
-#endif
-
-int sh2_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn)
-/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
-{
- shadow_l1e_t *sl1e;
- int done = 0;
- int flags;
-
- SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done,
- {
- flags = shadow_l1e_get_flags(*sl1e);
- if ( (flags & _PAGE_PRESENT)
- && (flags & _PAGE_RW)
- && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
- {
- shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
- if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
- & PGT_count_mask) == 0 )
- /* This breaks us cleanly out of the FOREACH macro */
- done = 1;
- }
- });
- return done;
-}
-
-
-int sh2_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
-/* Excises all mappings to guest frame from this shadow l1 table */
-{
- shadow_l1e_t *sl1e;
- int done = 0;
- int flags;
-
- SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done,
- {
- flags = shadow_l1e_get_flags(*sl1e);
- if ( (flags & _PAGE_PRESENT)
- && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
- {
- shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
- if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
- /* This breaks us cleanly out of the FOREACH macro */
- done = 1;
- }
- });
- return done;
-}
-
-/**************************************************************************/
-/* Functions to excise all pointers to shadows from higher-level shadows. */
-
-void sh2_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
-/* Blank out a single shadow entry */
-{
- switch (mfn_to_page(smfn)->count_info & PGC_SH2_type_mask)
- {
- case PGC_SH2_l1_shadow:
- shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
- case PGC_SH2_l2_shadow:
-#if GUEST_PAGING_LEVELS == 3
- case PGC_SH2_l2h_shadow:
-#endif
- shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
-#if GUEST_PAGING_LEVELS >= 3
- case PGC_SH2_l3_shadow:
- shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
-#if GUEST_PAGING_LEVELS >= 4
- case PGC_SH2_l4_shadow:
- shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
-#endif
-#endif
- default: BUG(); /* Called with the wrong kind of shadow. */
- }
-}
-
-int sh2_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
-/* Remove all mappings of this l1 shadow from this l2 shadow */
-{
- shadow_l2e_t *sl2e;
- int done = 0;
- int flags;
-#if GUEST_PAGING_LEVELS != 4
- int xen_mappings = !shadow2_mode_external(v->domain);
-#endif
-
- SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings,
- {
- flags = shadow_l2e_get_flags(*sl2e);
- if ( (flags & _PAGE_PRESENT)
- && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
- {
- shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
- if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH2_type_mask) == 0 )
- /* This breaks us cleanly out of the FOREACH macro */
- done = 1;
- }
- });
- return done;
-}
-
-#if GUEST_PAGING_LEVELS >= 3
-int sh2_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
-/* Remove all mappings of this l2 shadow from this l3 shadow */
-{
- shadow_l3e_t *sl3e;
- int done = 0;
- int flags;
-
- SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, done,
- {
- flags = shadow_l3e_get_flags(*sl3e);
- if ( (flags & _PAGE_PRESENT)
- && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
- {
- shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
- if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask) == 0 )
- /* This breaks us cleanly out of the FOREACH macro */
- done = 1;
- }
- });
- return done;
-}
-
-#if GUEST_PAGING_LEVELS >= 4
-int sh2_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
-/* Remove all mappings of this l3 shadow from this l4 shadow */
-{
- shadow_l4e_t *sl4e;
- int done = 0;
- int flags, xen_mappings = !shadow2_mode_external(v->domain);
-
- SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
- {
- flags = shadow_l4e_get_flags(*sl4e);
- if ( (flags & _PAGE_PRESENT)
- && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
- {
- shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
- if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH2_type_mask) == 0 )
- /* This breaks us cleanly out of the FOREACH macro */
- done = 1;
- }
- });
- return done;
-}
-#endif /* 64bit guest */
-#endif /* PAE guest */
-
-/**************************************************************************/
-/* Handling HVM guest writes to pagetables */
-
-/* Check that the user is allowed to perform this write.
- * Returns a mapped pointer to write to, and the mfn it's on,
- * or NULL for error. */
-static inline void * emulate_map_dest(struct vcpu *v,
- unsigned long vaddr,
- struct x86_emulate_ctxt *ctxt,
- mfn_t *mfnp)
-{
- walk_t gw;
- u32 flags;
- gfn_t gfn;
- mfn_t mfn;
-
- guest_walk_tables(v, vaddr, &gw, 1);
- flags = accumulate_guest_flags(&gw);
- gfn = guest_l1e_get_gfn(gw.eff_l1e);
- mfn = vcpu_gfn_to_mfn(v, gfn);
- sh2_audit_gw(v, &gw);
- unmap_walk(v, &gw);
-
- if ( !(flags & _PAGE_PRESENT)
- || !(flags & _PAGE_RW)
- || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) )
- {
- /* This write would have faulted even on bare metal */
- v->arch.shadow2.propagate_fault = 1;
- return NULL;
- }
-
- if ( !valid_mfn(mfn) )
- {
- /* Attempted a write to a bad gfn. This should never happen:
- * after all, we're here because this write is to a page table. */
- BUG();
- }
-
- ASSERT(sh2_mfn_is_a_page_table(mfn));
- *mfnp = mfn;
- return sh2_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
-}
-
-int
-sh2_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
- u32 bytes, struct x86_emulate_ctxt *ctxt)
-{
- ASSERT(shadow2_lock_is_acquired(v->domain));
- while ( bytes > 0 )
- {
- mfn_t mfn;
- int bytes_on_page;
- void *addr;
-
- bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK);
- if ( bytes_on_page > bytes )
- bytes_on_page = bytes;
-
- if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
- return X86EMUL_PROPAGATE_FAULT;
- memcpy(addr, src, bytes_on_page);
- shadow2_validate_guest_pt_write(v, mfn, addr, bytes_on_page);
- bytes -= bytes_on_page;
- /* If we are writing zeros to this page, might want to unshadow */
- if ( *(u8 *)addr == 0 )
- check_for_early_unshadow(v, mfn);
- sh2_unmap_domain_page(addr);
- }
- shadow2_audit_tables(v);
- return X86EMUL_CONTINUE;
-}
-
-int
-sh2_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
- unsigned long old, unsigned long new,
- unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
- mfn_t mfn;
- void *addr;
- unsigned long prev;
- int rv = X86EMUL_CONTINUE;
-
- ASSERT(shadow2_lock_is_acquired(v->domain));
- ASSERT(bytes <= sizeof (unsigned long));
-
- if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
- return X86EMUL_PROPAGATE_FAULT;
-
- switch (bytes)
- {
- case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
- case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
- case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
- case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
- default:
- SHADOW2_PRINTK("cmpxchg of size %i is not supported\n", bytes);
- prev = ~old;
- }
-
- if ( (prev == old) )
- shadow2_validate_guest_pt_write(v, mfn, addr, bytes);
- else
- rv = X86EMUL_CMPXCHG_FAILED;
-
- SHADOW2_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
- " wanted %#lx now %#lx bytes %u\n",
- vaddr, prev, old, new, *(unsigned long *)addr, bytes);
-
- /* If we are writing zeros to this page, might want to unshadow */
- if ( *(u8 *)addr == 0 )
- check_for_early_unshadow(v, mfn);
-
- sh2_unmap_domain_page(addr);
- shadow2_audit_tables(v);
- check_for_early_unshadow(v, mfn);
- return rv;
-}
-
-int
-sh2_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
- unsigned long old_lo, unsigned long old_hi,
- unsigned long new_lo, unsigned long new_hi,
- struct x86_emulate_ctxt *ctxt)
-{
- mfn_t mfn;
- void *addr;
- u64 old, new, prev;
- int rv = X86EMUL_CONTINUE;
-
- ASSERT(shadow2_lock_is_acquired(v->domain));
-
- if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
- return X86EMUL_PROPAGATE_FAULT;
-
- old = (((u64) old_hi) << 32) | (u64) old_lo;
- new = (((u64) new_hi) << 32) | (u64) new_lo;
- prev = cmpxchg(((u64 *)addr), old, new);
-
- if ( (prev == old) )
- shadow2_validate_guest_pt_write(v, mfn, addr, 8);
- else
- rv = X86EMUL_CMPXCHG_FAILED;
-
- /* If we are writing zeros to this page, might want to unshadow */
- if ( *(u8 *)addr == 0 )
- check_for_early_unshadow(v, mfn);
-
- sh2_unmap_domain_page(addr);
- shadow2_audit_tables(v);
- check_for_early_unshadow(v, mfn);
- return rv;
-}
-
-
-/**************************************************************************/
-/* Audit tools */
-
-#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
-
-#define AUDIT_FAIL(_level, _fmt, _a...) do { \
- printk("Shadow2 %u-on-%u audit failed at level %i, index %i\n" \
- "gl" #_level "mfn = %" SH2_PRI_mfn \
- " sl" #_level "mfn = %" SH2_PRI_mfn \
- " &gl" #_level "e = %p &sl" #_level "e = %p" \
- " gl" #_level "e = %" SH2_PRI_gpte \
- " sl" #_level "e = %" SH2_PRI_pte "\nError: " _fmt "\n", \
- GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
- _level, guest_index(gl ## _level ## e), \
- mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
- gl ## _level ## e, sl ## _level ## e, \
- gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
- ##_a); \
- BUG(); \
- done = 1; \
-} while (0)
-
-
-static char * sh2_audit_flags(struct vcpu *v, int level,
- int gflags, int sflags)
-/* Common code for auditing flag bits */
-{
- if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
- return "shadow is present but guest is not present";
- if ( (sflags & _PAGE_GLOBAL) && !hvm_guest(v) )
- return "global bit set in PV shadow";
- if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
- && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) )
- return "dirty bit not propagated";
- if ( level == 2 && (sflags & _PAGE_PSE) )
- return "PS bit set in shadow";
-#if SHADOW_PAGING_LEVELS == 3
- if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
-#endif
- if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
- return "user/supervisor bit does not match";
- if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
- return "NX bit does not match";
- if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
- return "shadow grants write access but guest does not";
- if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) )
- return "accessed bit not propagated";
- return NULL;
-}
-
-static inline mfn_t
-audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
-/* Convert this gfn to an mfn in the manner appropriate for the
- * guest pagetable it's used in (gmfn) */
-{
- if ( !shadow2_mode_translate(v->domain) )
- return _mfn(gfn_x(gfn));
-
- if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
- != PGT_writable_page )
- return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
- else
- return sh2_gfn_to_mfn(v->domain, gfn_x(gfn));
-}
-
-
-int sh2_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
-{
- guest_l1e_t *gl1e, *gp;
- shadow_l1e_t *sl1e;
- mfn_t mfn, gmfn, gl1mfn;
- gfn_t gfn;
- char *s;
- int done = 0;
-
- /* Follow the backpointer */
- gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info);
- gl1e = gp = sh2_map_domain_page(gl1mfn);
- SHADOW2_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
-
- s = sh2_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
- shadow_l1e_get_flags(*sl1e));
- if ( s ) AUDIT_FAIL(1, "%s", s);
-
- if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
- {
- gfn = guest_l1e_get_gfn(*gl1e);
- mfn = shadow_l1e_get_mfn(*sl1e);
- gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
- if ( mfn_x(gmfn) != mfn_x(mfn) )
- AUDIT_FAIL(1, "bad translation: gfn %" SH2_PRI_gfn
- " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
- gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
- }
- });
- sh2_unmap_domain_page(gp);
- return done;
-}
-
-int sh2_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
-{
- guest_l1e_t *gl1e, e;
- shadow_l1e_t *sl1e;
- mfn_t gl1mfn = _mfn(INVALID_MFN);
- int f;
- int done = 0;
-
- /* fl1 has no useful backpointer: all we can check are flags */
- e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
- SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
- f = shadow_l1e_get_flags(*sl1e);
- f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
- if ( !(f == 0
- || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
- _PAGE_ACCESSED|_PAGE_DIRTY)
- || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) )
- AUDIT_FAIL(1, "fl1e has bad flags");
- });
- return 0;
-}
-
-int sh2_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
-{
- guest_l2e_t *gl2e, *gp;
- shadow_l2e_t *sl2e;
- mfn_t mfn, gmfn, gl2mfn;
- gfn_t gfn;
- char *s;
- int done = 0;
-#if GUEST_PAGING_LEVELS != 4
- int xen_mappings = !shadow2_mode_external(v->domain);
-#endif
-
- /* Follow the backpointer */
- gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info);
- gl2e = gp = sh2_map_domain_page(gl2mfn);
- SHADOW2_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
-
- s = sh2_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
- shadow_l2e_get_flags(*sl2e));
- if ( s ) AUDIT_FAIL(2, "%s", s);
-
- if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
- {
- gfn = guest_l2e_get_gfn(*gl2e);
- mfn = shadow_l2e_get_mfn(*sl2e);
- gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
- ? get_fl1_shadow_status(v, gfn)
- : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
- PGC_SH2_l1_shadow);
- if ( mfn_x(gmfn) != mfn_x(mfn) )
- AUDIT_FAIL(2, "bad translation: gfn %" SH2_PRI_gfn
- " (--> %" SH2_PRI_mfn ")"
- " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
- gfn_x(gfn),
- (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
- : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
- mfn_x(gmfn), mfn_x(mfn));
- }
- });
- sh2_unmap_domain_page(gp);
- return 0;
-}
-
-#if GUEST_PAGING_LEVELS >= 3
-int sh2_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
-{
- guest_l3e_t *gl3e, *gp;
- shadow_l3e_t *sl3e;
- mfn_t mfn, gmfn, gl3mfn;
- gfn_t gfn;
- char *s;
- int done = 0;
-
- /* Follow the backpointer */
- gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info);
- gl3e = gp = sh2_map_domain_page(gl3mfn);
- SHADOW2_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
-
- s = sh2_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
- shadow_l3e_get_flags(*sl3e));
- if ( s ) AUDIT_FAIL(3, "%s", s);
-
- if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
- {
- gfn = guest_l3e_get_gfn(*gl3e);
- mfn = shadow_l3e_get_mfn(*sl3e);
- gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
- (GUEST_PAGING_LEVELS == 3
- && !shadow2_mode_external(v->domain)
- && (guest_index(gl3e) % 4) == 3)
- ? PGC_SH2_l2h_pae_shadow
- : PGC_SH2_l2_shadow);
- if ( mfn_x(gmfn) != mfn_x(mfn) )
- AUDIT_FAIL(3, "bad translation: gfn %" SH2_PRI_gfn
- " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
- gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
- }
- });
- sh2_unmap_domain_page(gp);
- return 0;
-}
-#endif /* GUEST_PAGING_LEVELS >= 3 */
-
-#if GUEST_PAGING_LEVELS >= 4
-int sh2_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
-{
- guest_l4e_t *gl4e, *gp;
- shadow_l4e_t *sl4e;
- mfn_t mfn, gmfn, gl4mfn;
- gfn_t gfn;
- char *s;
- int done = 0;
- int xen_mappings = !shadow2_mode_external(v->domain);
-
- /* Follow the backpointer */
- gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info);
- gl4e = gp = sh2_map_domain_page(gl4mfn);
- SHADOW2_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
- {
- s = sh2_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
- shadow_l4e_get_flags(*sl4e));
- if ( s ) AUDIT_FAIL(4, "%s", s);
-
- if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
- {
- gfn = guest_l4e_get_gfn(*gl4e);
- mfn = shadow_l4e_get_mfn(*sl4e);
- gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
- PGC_SH2_l3_shadow);
- if ( mfn_x(gmfn) != mfn_x(mfn) )
- AUDIT_FAIL(4, "bad translation: gfn %" SH2_PRI_gfn
- " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
- gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
- }
- });
- sh2_unmap_domain_page(gp);
- return 0;
-}
-#endif /* GUEST_PAGING_LEVELS >= 4 */
-
-
-#undef AUDIT_FAIL
-
-#endif /* Audit code */
-
-/**************************************************************************/
-/* Entry points into this mode of the shadow code.
- * This will all be mangled by the preprocessor to uniquify everything. */
-struct shadow2_paging_mode sh2_paging_mode = {
- .page_fault = sh2_page_fault,
- .invlpg = sh2_invlpg,
- .gva_to_gpa = sh2_gva_to_gpa,
- .gva_to_gfn = sh2_gva_to_gfn,
- .update_cr3 = sh2_update_cr3,
- .map_and_validate_gl1e = sh2_map_and_validate_gl1e,
- .map_and_validate_gl2e = sh2_map_and_validate_gl2e,
- .map_and_validate_gl2he = sh2_map_and_validate_gl2he,
- .map_and_validate_gl3e = sh2_map_and_validate_gl3e,
- .map_and_validate_gl4e = sh2_map_and_validate_gl4e,
- .detach_old_tables = sh2_detach_old_tables,
- .x86_emulate_write = sh2_x86_emulate_write,
- .x86_emulate_cmpxchg = sh2_x86_emulate_cmpxchg,
- .x86_emulate_cmpxchg8b = sh2_x86_emulate_cmpxchg8b,
- .make_monitor_table = sh2_make_monitor_table,
- .destroy_monitor_table = sh2_destroy_monitor_table,
-#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
- .guess_wrmap = sh2_guess_wrmap,
-#endif
- .guest_levels = GUEST_PAGING_LEVELS,
- .shadow_levels = SHADOW_PAGING_LEVELS,
-};
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
{
- if ( shadow2_mode_external(d) && guest_mode(regs) )
- return shadow2_fault(addr, regs);
+ if ( shadow_mode_external(d) && guest_mode(regs) )
+ return shadow_fault(addr, regs);
if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
return handle_gdt_ldt_mapping_fault(
addr - GDT_LDT_VIRT_START, regs);
ptwr_do_page_fault(d, addr, regs) )
return EXCRET_fault_fixed;
- if ( shadow2_mode_enabled(d) )
- return shadow2_fault(addr, regs);
+ if ( shadow_mode_enabled(d) )
+ return shadow_fault(addr, regs);
return 0;
}
struct shadow_domain {
u32 mode; /* flags to control shadow operation */
- spinlock_t lock; /* shadow2 domain lock */
+ spinlock_t lock; /* shadow domain lock */
int locker; /* processor which holds the lock */
const char *locker_function; /* Func that took it */
- struct list_head freelists[SHADOW2_MAX_ORDER + 1];
+ struct list_head freelists[SHADOW_MAX_ORDER + 1];
struct list_head p2m_freelist;
struct list_head p2m_inuse;
struct list_head toplevel_shadows;
unsigned int free_pages; /* number of pages on freelists */
unsigned int p2m_pages; /* number of pages in p2m map */
- /* Shadow2 hashtable */
- struct shadow2_hash_entry *hash_table;
- struct shadow2_hash_entry *hash_freelist;
- struct shadow2_hash_entry *hash_allocations;
+ /* Shadow hashtable */
+ struct shadow_hash_entry *hash_table;
+ struct shadow_hash_entry *hash_freelist;
+ struct shadow_hash_entry *hash_allocations;
int hash_walking; /* Some function is walking the hash table */
/* Shadow log-dirty bitmap */
/* Shadow-translated guest: Pseudophys base address of reserved area. */
unsigned long first_reserved_pfn;
- struct shadow_domain shadow2;
+ struct shadow_domain shadow;
/* Shadow translated domain: P2M mapping */
pagetable_t phys_table;
struct shadow_vcpu {
/* Pointers to mode-specific entry points. */
- struct shadow2_paging_mode *mode;
+ struct shadow_paging_mode *mode;
/* Last MFN that we emulated a write to. */
unsigned long last_emulated_mfn;
/* HVM guest: paging enabled (CR0.PG)? */
/* Current LDT details. */
unsigned long shadow_ldt_mapcnt;
- struct shadow_vcpu shadow2;
+ struct shadow_vcpu shadow;
} __cacheline_aligned;
/* shorthands to improve code legibility */
/* Each frame can be threaded onto a doubly-linked list. */
union {
struct list_head list;
- /* Shadow2 uses this field as an up-pointer in lower-level shadows */
+ /* Shadow uses this field as an up-pointer in lower-level shadows */
paddr_t up;
};
/* Only used on guest pages with a shadow.
* Guest pages with a shadow must have a non-zero type count, so this
* does not conflict with the tlbflush timestamp. */
- u32 shadow2_flags;
+ u32 shadow_flags;
// XXX -- we expect to add another field here, to be used for min/max
// purposes, which is only used for shadow pages.
#define PGT_ldt_page (6U<<29) /* using this page in an LDT? */
#define PGT_writable_page (7U<<29) /* has writable mappings of this page? */
-#ifndef SHADOW2
+#ifndef SHADOW
#define PGT_l1_shadow PGT_l1_page_table
#define PGT_l2_shadow PGT_l2_page_table
#define PGT_l3_shadow PGT_l3_page_table
/* 16-bit count of uses of this frame as its current type. */
#define PGT_count_mask ((1U<<16)-1)
-#ifndef SHADOW2
+#ifndef SHADOW
#ifdef __x86_64__
#define PGT_high_mfn_shift 52
#define PGT_high_mfn_mask (0xfffUL << PGT_high_mfn_shift)
#define PGT_score_shift 23
#define PGT_score_mask (((1U<<4)-1)<<PGT_score_shift)
#endif
-#endif /* SHADOW2 */
+#endif /* SHADOW */
/* Cleared when the owning guest 'frees' this page. */
#define _PGC_allocated 31
/* 29-bit count of references to this frame. */
#define PGC_count_mask ((1U<<29)-1)
-/* shadow2 uses the count_info on shadow pages somewhat differently */
-/* NB: please coordinate any changes here with the SH2F's in shadow2.h */
-#define PGC_SH2_none (0U<<28) /* on the shadow2 free list */
-#define PGC_SH2_min_shadow (1U<<28)
-#define PGC_SH2_l1_32_shadow (1U<<28) /* shadowing a 32-bit L1 guest page */
-#define PGC_SH2_fl1_32_shadow (2U<<28) /* L1 shadow for a 32b 4M superpage */
-#define PGC_SH2_l2_32_shadow (3U<<28) /* shadowing a 32-bit L2 guest page */
-#define PGC_SH2_l1_pae_shadow (4U<<28) /* shadowing a pae L1 page */
-#define PGC_SH2_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */
-#define PGC_SH2_l2_pae_shadow (6U<<28) /* shadowing a pae L2-low page */
-#define PGC_SH2_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */
-#define PGC_SH2_l3_pae_shadow (8U<<28) /* shadowing a pae L3 page */
-#define PGC_SH2_l1_64_shadow (9U<<28) /* shadowing a 64-bit L1 page */
-#define PGC_SH2_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */
-#define PGC_SH2_l2_64_shadow (11U<<28) /* shadowing a 64-bit L2 page */
-#define PGC_SH2_l3_64_shadow (12U<<28) /* shadowing a 64-bit L3 page */
-#define PGC_SH2_l4_64_shadow (13U<<28) /* shadowing a 64-bit L4 page */
-#define PGC_SH2_max_shadow (13U<<28)
-#define PGC_SH2_p2m_table (14U<<28) /* in use as the p2m table */
-#define PGC_SH2_monitor_table (15U<<28) /* in use as a monitor table */
-#define PGC_SH2_unused (15U<<28)
-
-#define PGC_SH2_type_mask (15U<<28)
-#define PGC_SH2_type_shift 28
-
-#define PGC_SH2_pinned (1U<<27)
-
-#define _PGC_SH2_log_dirty 26
-#define PGC_SH2_log_dirty (1U<<26)
+/* shadow uses the count_info on shadow pages somewhat differently */
+/* NB: please coordinate any changes here with the SHF's in shadow.h */
+#define PGC_SH_none (0U<<28) /* on the shadow free list */
+#define PGC_SH_min_shadow (1U<<28)
+#define PGC_SH_l1_32_shadow (1U<<28) /* shadowing a 32-bit L1 guest page */
+#define PGC_SH_fl1_32_shadow (2U<<28) /* L1 shadow for a 32b 4M superpage */
+#define PGC_SH_l2_32_shadow (3U<<28) /* shadowing a 32-bit L2 guest page */
+#define PGC_SH_l1_pae_shadow (4U<<28) /* shadowing a pae L1 page */
+#define PGC_SH_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */
+#define PGC_SH_l2_pae_shadow (6U<<28) /* shadowing a pae L2-low page */
+#define PGC_SH_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */
+#define PGC_SH_l3_pae_shadow (8U<<28) /* shadowing a pae L3 page */
+#define PGC_SH_l1_64_shadow (9U<<28) /* shadowing a 64-bit L1 page */
+#define PGC_SH_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */
+#define PGC_SH_l2_64_shadow (11U<<28) /* shadowing a 64-bit L2 page */
+#define PGC_SH_l3_64_shadow (12U<<28) /* shadowing a 64-bit L3 page */
+#define PGC_SH_l4_64_shadow (13U<<28) /* shadowing a 64-bit L4 page */
+#define PGC_SH_max_shadow (13U<<28)
+#define PGC_SH_p2m_table (14U<<28) /* in use as the p2m table */
+#define PGC_SH_monitor_table (15U<<28) /* in use as a monitor table */
+#define PGC_SH_unused (15U<<28)
+
+#define PGC_SH_type_mask (15U<<28)
+#define PGC_SH_type_shift 28
+
+#define PGC_SH_pinned (1U<<27)
+
+#define _PGC_SH_log_dirty 26
+#define PGC_SH_log_dirty (1U<<26)
/* 26 bit ref count for shadow pages */
-#define PGC_SH2_count_mask ((1U<<26) - 1)
+#define PGC_SH_count_mask ((1U<<26) - 1)
/* We trust the slab allocator in slab.c, and our use of it. */
#define PageSlab(page) (1)
/* The order of the largest allocation unit we use for shadow pages */
#if CONFIG_PAGING_LEVELS == 2
-#define SHADOW2_MAX_ORDER 0 /* Only ever need 4k allocations */
+#define SHADOW_MAX_ORDER 0 /* Only ever need 4k allocations */
#else
-#define SHADOW2_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
+#define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
#endif
#define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain))
extern int shadow_remove_all_write_access(
struct domain *d, unsigned long gmfn, unsigned long mfn);
extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
-extern int _shadow2_mode_refcounts(struct domain *d);
+extern int _shadow_mode_refcounts(struct domain *d);
static inline void put_page(struct page_info *page)
{
unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
unlikely(d != _domain) ) /* Wrong owner? */
{
- if ( !_shadow2_mode_refcounts(domain) )
+ if ( !_shadow_mode_refcounts(domain) )
DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
PRtype_info "\n",
page_to_mfn(page), domain, unpickle_domptr(d),
#define mfn_to_gmfn(_d, mfn) \
- ( (shadow2_mode_translate(_d)) \
+ ( (shadow_mode_translate(_d)) \
? get_gpfn_from_mfn(mfn) \
: (mfn) )
-#define gmfn_to_mfn(_d, gpfn) mfn_x(sh2_gfn_to_mfn(_d, gpfn))
+#define gmfn_to_mfn(_d, gpfn) mfn_x(sh_gfn_to_mfn(_d, gpfn))
/*
+++ /dev/null
-
-#ifndef __X86_PAGE_GUEST_H__
-#define __X86_PAGE_GUEST_H__
-
-#ifndef __ASSEMBLY__
-# include <asm/types.h>
-#endif
-
-#define PAGETABLE_ORDER_32 10
-#define L1_PAGETABLE_ENTRIES_32 (1<<PAGETABLE_ORDER_32)
-#define L2_PAGETABLE_ENTRIES_32 (1<<PAGETABLE_ORDER_32)
-#define ROOT_PAGETABLE_ENTRIES_32 L2_PAGETABLE_ENTRIES_32
-
-
-#define L1_PAGETABLE_SHIFT_32 12
-#define L2_PAGETABLE_SHIFT_32 22
-
-/* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. */
-
-#ifndef __ASSEMBLY__
-
-typedef u32 intpte_32_t;
-
-typedef struct { intpte_32_t l1; } l1_pgentry_32_t;
-typedef struct { intpte_32_t l2; } l2_pgentry_32_t;
-typedef l2_pgentry_t root_pgentry_32_t;
-#endif
-
-#define get_pte_flags_32(x) ((u32)(x) & 0xFFF)
-#define put_pte_flags_32(x) ((intpte_32_t)(x))
-
-/* Get pte access flags (unsigned int). */
-#define l1e_get_flags_32(x) (get_pte_flags_32((x).l1))
-#define l2e_get_flags_32(x) (get_pte_flags_32((x).l2))
-
-#define l1e_get_paddr_32(x) \
- ((paddr_t)(((x).l1 & (PADDR_MASK&PAGE_MASK))))
-#define l2e_get_paddr_32(x) \
- ((paddr_t)(((x).l2 & (PADDR_MASK&PAGE_MASK))))
-
-/* Construct an empty pte. */
-#define l1e_empty_32() ((l1_pgentry_32_t) { 0 })
-#define l2e_empty_32() ((l2_pgentry_32_t) { 0 })
-
-/* Construct a pte from a pfn and access flags. */
-#define l1e_from_pfn_32(pfn, flags) \
- ((l1_pgentry_32_t) { ((intpte_32_t)(pfn) << PAGE_SHIFT) | put_pte_flags_32(flags) })
-#define l2e_from_pfn_32(pfn, flags) \
- ((l2_pgentry_32_t) { ((intpte_32_t)(pfn) << PAGE_SHIFT) | put_pte_flags_32(flags) })
-
-/* Construct a pte from a physical address and access flags. */
-#ifndef __ASSEMBLY__
-static inline l1_pgentry_32_t l1e_from_paddr_32(paddr_t pa, unsigned int flags)
-{
- ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0);
- return (l1_pgentry_32_t) { pa | put_pte_flags_32(flags) };
-}
-static inline l2_pgentry_32_t l2e_from_paddr_32(paddr_t pa, unsigned int flags)
-{
- ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0);
- return (l2_pgentry_32_t) { pa | put_pte_flags_32(flags) };
-}
-#endif /* !__ASSEMBLY__ */
-
-
-/* Construct a pte from a page pointer and access flags. */
-#define l1e_from_page_32(page, flags) (l1e_from_pfn_32(page_to_mfn(page),(flags)))
-#define l2e_from_page_32(page, flags) (l2e_from_pfn_32(page_to_mfn(page),(flags)))
-
-/* Add extra flags to an existing pte. */
-#define l1e_add_flags_32(x, flags) ((x).l1 |= put_pte_flags_32(flags))
-#define l2e_add_flags_32(x, flags) ((x).l2 |= put_pte_flags_32(flags))
-
-/* Remove flags from an existing pte. */
-#define l1e_remove_flags_32(x, flags) ((x).l1 &= ~put_pte_flags_32(flags))
-#define l2e_remove_flags_32(x, flags) ((x).l2 &= ~put_pte_flags_32(flags))
-
-/* Check if a pte's page mapping or significant access flags have changed. */
-#define l1e_has_changed_32(x,y,flags) \
- ( !!(((x).l1 ^ (y).l1) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags_32(flags))) )
-#define l2e_has_changed_32(x,y,flags) \
- ( !!(((x).l2 ^ (y).l2) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags_32(flags))) )
-
-/* Given a virtual address, get an entry offset into a page table. */
-#define l1_table_offset_32(a) \
- (((a) >> L1_PAGETABLE_SHIFT_32) & (L1_PAGETABLE_ENTRIES_32 - 1))
-#define l2_table_offset_32(a) \
- (((a) >> L2_PAGETABLE_SHIFT_32) & (L2_PAGETABLE_ENTRIES_32 - 1))
-
-#define linear_l1_table_32 \
- ((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START))
-
-#define linear_pg_table_32 linear_l1_table_32
-
-#endif /* __X86_PAGE_GUEST_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
PERFCOUNTER_CPU(exception_fixed, "pre-exception fixed")
-/* Shadow2 counters */
-PERFCOUNTER_CPU(shadow2_alloc, "calls to shadow2_alloc")
-PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs")
+/* Shadow counters */
+PERFCOUNTER_CPU(shadow_alloc, "calls to shadow_alloc")
+PERFCOUNTER_CPU(shadow_alloc_tlbflush, "shadow_alloc flushed TLBs")
/* STATUS counters do not reset when 'P' is hit */
-PERFSTATUS(shadow2_alloc_count, "number of shadow pages in use")
-PERFCOUNTER_CPU(shadow2_free, "calls to shadow2_free")
-PERFCOUNTER_CPU(shadow2_prealloc_1, "shadow2 recycles old shadows")
-PERFCOUNTER_CPU(shadow2_prealloc_2, "shadow2 recycles in-use shadows")
-PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map")
-PERFCOUNTER_CPU(shadow2_a_update, "shadow2 A bit update")
-PERFCOUNTER_CPU(shadow2_ad_update, "shadow2 A&D bit update")
-PERFCOUNTER_CPU(shadow2_fault, "calls to shadow2_fault")
-PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn")
-PERFCOUNTER_CPU(shadow2_fault_bail_not_present,
- "shadow2_fault guest not-present")
-PERFCOUNTER_CPU(shadow2_fault_bail_nx, "shadow2_fault guest NX fault")
-PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault")
-PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor,
- "shadow2_fault guest U/S fault")
-PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read")
-PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write")
-PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails")
-PERFCOUNTER_CPU(shadow2_fault_mmio, "shadow2_fault handled as mmio")
-PERFCOUNTER_CPU(shadow2_fault_fixed, "shadow2_fault fixed fault")
-PERFCOUNTER_CPU(shadow2_ptwr_emulate, "shadow2 causes ptwr to emulate")
-PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e")
-PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e")
-PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e")
-PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e")
-PERFCOUNTER_CPU(shadow2_hash_lookups, "calls to shadow2_hash_lookup")
-PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head")
-PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses")
-PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status")
-PERFCOUNTER_CPU(shadow2_hash_inserts, "calls to shadow2_hash_insert")
-PERFCOUNTER_CPU(shadow2_hash_deletes, "calls to shadow2_hash_delete")
-PERFCOUNTER_CPU(shadow2_writeable, "shadow2 removes write access")
-PERFCOUNTER_CPU(shadow2_writeable_h_1, "shadow2 writeable: 32b w2k3")
-PERFCOUNTER_CPU(shadow2_writeable_h_2, "shadow2 writeable: 32pae w2k3")
-PERFCOUNTER_CPU(shadow2_writeable_h_3, "shadow2 writeable: 64b w2k3")
-PERFCOUNTER_CPU(shadow2_writeable_h_4, "shadow2 writeable: 32b linux low")
-PERFCOUNTER_CPU(shadow2_writeable_bf, "shadow2 writeable brute-force")
-PERFCOUNTER_CPU(shadow2_mappings, "shadow2 removes all mappings")
-PERFCOUNTER_CPU(shadow2_mappings_bf, "shadow2 rm-mappings brute-force")
-PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit")
-PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit")
-PERFCOUNTER_CPU(shadow2_unshadow, "shadow2 unshadows a page")
-PERFCOUNTER_CPU(shadow2_up_pointer, "shadow2 unshadow by up-pointer")
-PERFCOUNTER_CPU(shadow2_unshadow_bf, "shadow2 unshadow brute-force")
-PERFCOUNTER_CPU(shadow2_get_page_fail, "shadow2_get_page_from_l1e failed")
-PERFCOUNTER_CPU(shadow2_guest_walk, "shadow2 walks guest tables")
-PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits")
-PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses")
+PERFSTATUS(shadow_alloc_count, "number of shadow pages in use")
+PERFCOUNTER_CPU(shadow_free, "calls to shadow_free")
+PERFCOUNTER_CPU(shadow_prealloc_1, "shadow recycles old shadows")
+PERFCOUNTER_CPU(shadow_prealloc_2, "shadow recycles in-use shadows")
+PERFCOUNTER_CPU(shadow_linear_map_failed, "shadow hit read-only linear map")
+PERFCOUNTER_CPU(shadow_a_update, "shadow A bit update")
+PERFCOUNTER_CPU(shadow_ad_update, "shadow A&D bit update")
+PERFCOUNTER_CPU(shadow_fault, "calls to shadow_fault")
+PERFCOUNTER_CPU(shadow_fault_bail_bad_gfn, "shadow_fault guest bad gfn")
+PERFCOUNTER_CPU(shadow_fault_bail_not_present,
+ "shadow_fault guest not-present")
+PERFCOUNTER_CPU(shadow_fault_bail_nx, "shadow_fault guest NX fault")
+PERFCOUNTER_CPU(shadow_fault_bail_ro_mapping, "shadow_fault guest R/W fault")
+PERFCOUNTER_CPU(shadow_fault_bail_user_supervisor,
+ "shadow_fault guest U/S fault")
+PERFCOUNTER_CPU(shadow_fault_emulate_read, "shadow_fault emulates a read")
+PERFCOUNTER_CPU(shadow_fault_emulate_write, "shadow_fault emulates a write")
+PERFCOUNTER_CPU(shadow_fault_emulate_failed, "shadow_fault emulator fails")
+PERFCOUNTER_CPU(shadow_fault_mmio, "shadow_fault handled as mmio")
+PERFCOUNTER_CPU(shadow_fault_fixed, "shadow_fault fixed fault")
+PERFCOUNTER_CPU(shadow_ptwr_emulate, "shadow causes ptwr to emulate")
+PERFCOUNTER_CPU(shadow_validate_gl1e_calls, "calls to shadow_validate_gl1e")
+PERFCOUNTER_CPU(shadow_validate_gl2e_calls, "calls to shadow_validate_gl2e")
+PERFCOUNTER_CPU(shadow_validate_gl3e_calls, "calls to shadow_validate_gl3e")
+PERFCOUNTER_CPU(shadow_validate_gl4e_calls, "calls to shadow_validate_gl4e")
+PERFCOUNTER_CPU(shadow_hash_lookups, "calls to shadow_hash_lookup")
+PERFCOUNTER_CPU(shadow_hash_lookup_head, "shadow hash hit in bucket head")
+PERFCOUNTER_CPU(shadow_hash_lookup_miss, "shadow hash misses")
+PERFCOUNTER_CPU(shadow_get_shadow_status, "calls to get_shadow_status")
+PERFCOUNTER_CPU(shadow_hash_inserts, "calls to shadow_hash_insert")
+PERFCOUNTER_CPU(shadow_hash_deletes, "calls to shadow_hash_delete")
+PERFCOUNTER_CPU(shadow_writeable, "shadow removes write access")
+PERFCOUNTER_CPU(shadow_writeable_h_1, "shadow writeable: 32b w2k3")
+PERFCOUNTER_CPU(shadow_writeable_h_2, "shadow writeable: 32pae w2k3")
+PERFCOUNTER_CPU(shadow_writeable_h_3, "shadow writeable: 64b w2k3")
+PERFCOUNTER_CPU(shadow_writeable_h_4, "shadow writeable: 32b linux low")
+PERFCOUNTER_CPU(shadow_writeable_bf, "shadow writeable brute-force")
+PERFCOUNTER_CPU(shadow_mappings, "shadow removes all mappings")
+PERFCOUNTER_CPU(shadow_mappings_bf, "shadow rm-mappings brute-force")
+PERFCOUNTER_CPU(shadow_early_unshadow, "shadow unshadows for fork/exit")
+PERFCOUNTER_CPU(shadow_early_unshadow_top, "shadow unhooks for fork/exit")
+PERFCOUNTER_CPU(shadow_unshadow, "shadow unshadows a page")
+PERFCOUNTER_CPU(shadow_up_pointer, "shadow unshadow by up-pointer")
+PERFCOUNTER_CPU(shadow_unshadow_bf, "shadow unshadow brute-force")
+PERFCOUNTER_CPU(shadow_get_page_fail, "shadow_get_page_from_l1e failed")
+PERFCOUNTER_CPU(shadow_guest_walk, "shadow walks guest tables")
+PERFCOUNTER_CPU(shadow_walk_cache_hit, "shadow walk-cache hits")
+PERFCOUNTER_CPU(shadow_walk_cache_miss, "shadow walk-cache misses")
/*#endif*/ /* __XEN_PERFC_DEFN_H__ */
/******************************************************************************
* include/asm-x86/shadow.h
*
- * Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
#ifndef _XEN_SHADOW_H
#define _XEN_SHADOW_H
-/* This file is just a wrapper around the new Shadow2 header,
- * providing names that must be defined in any shadow implementation. */
-
-#include <asm/shadow2.h>
+#include <public/domctl.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <asm/flushtlb.h>
/* How to make sure a page is not referred to in a shadow PT */
/* This will need to be a for_each_vcpu if we go to per-vcpu shadows */
#define shadow_drop_references(_d, _p) \
- shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
+ shadow_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
#define shadow_sync_and_drop_references(_d, _p) \
- shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
-
-/* Whether we are translating the domain's frame numbers for it */
-#define shadow_mode_translate(d) shadow2_mode_translate(d)
+ shadow_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
-/* ...and if so, how to add and remove entries in the mapping */
+/* How to add and remove entries in the p2m mapping. */
#define guest_physmap_add_page(_d, _p, _m) \
- shadow2_guest_physmap_add_page((_d), (_p), (_m))
+ shadow_guest_physmap_add_page((_d), (_p), (_m))
#define guest_physmap_remove_page(_d, _p, _m ) \
- shadow2_guest_physmap_remove_page((_d), (_p), (_m))
+ shadow_guest_physmap_remove_page((_d), (_p), (_m))
+
+/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
+
+#define SHM2_shift 10
+/* We're in one of the shadow modes */
+#define SHM2_enable (1U << SHM2_shift)
+/* Refcounts based on shadow tables instead of guest tables */
+#define SHM2_refcounts (XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT << SHM2_shift)
+/* Enable log dirty mode */
+#define SHM2_log_dirty (XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY << SHM2_shift)
+/* Xen does p2m translation, not guest */
+#define SHM2_translate (XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE << SHM2_shift)
+/* Xen does not steal address space from the domain for its own booking;
+ * requires VT or similar mechanisms */
+#define SHM2_external (XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL << SHM2_shift)
+
+#define shadow_mode_enabled(_d) ((_d)->arch.shadow.mode)
+#define shadow_mode_refcounts(_d) ((_d)->arch.shadow.mode & SHM2_refcounts)
+#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow.mode & SHM2_log_dirty)
+#define shadow_mode_translate(_d) ((_d)->arch.shadow.mode & SHM2_translate)
+#define shadow_mode_external(_d) ((_d)->arch.shadow.mode & SHM2_external)
+
+/* Xen traps & emulates all reads of all page table pages:
+ * not yet supported
+ */
+#define shadow_mode_trap_reads(_d) ({ (void)(_d); 0; })
+
+// flags used in the return value of the shadow_set_lXe() functions...
+#define SHADOW_SET_CHANGED 0x1
+#define SHADOW_SET_FLUSH 0x2
+#define SHADOW_SET_ERROR 0x4
+#define SHADOW_SET_L3PAE_RECOPY 0x8
+
+// How do we tell that we have a 32-bit PV guest in a 64-bit Xen?
+#ifdef __x86_64__
+#define pv_32bit_guest(_v) 0 // not yet supported
+#else
+// Fix: expand the macro parameter (_v), not a stray 'v' captured from the
+// caller's scope, and parenthesize the whole expansion.
+#define pv_32bit_guest(_v) (!hvm_guest(_v))
+#endif
+
+/* The shadow lock.
+ *
+ * This lock is per-domain. It is intended to allow us to make atomic
+ * updates to the software TLB that the shadow tables provide.
+ *
+ * Specifically, it protects:
+ * - all changes to shadow page table pages
+ * - the shadow hash table
+ * - the shadow page allocator
+ * - all changes to guest page table pages; if/when the notion of
+ *   out-of-sync pages is added to this code, then the shadow lock is
+ *   protecting all guest page table pages which are not currently
+ *   listed as both guest-writable and out-of-sync...
+ * XXX -- need to think about this relative to writable page tables.
+ * - all changes to the page_info->tlbflush_timestamp
+ * - the page_info->count fields on shadow pages
+ * - the shadow dirty bit array and count
+ * - XXX
+ */
+#ifndef CONFIG_SMP
+#error shadow.h currently requires CONFIG_SMP
+#endif
+
+#define shadow_lock_init(_d) \
+ do { \
+ spin_lock_init(&(_d)->arch.shadow.lock); \
+ (_d)->arch.shadow.locker = -1; \
+ (_d)->arch.shadow.locker_function = "nobody"; \
+ } while (0)
+
+#define shadow_lock_is_acquired(_d) \
+ (current->processor == (_d)->arch.shadow.locker)
+
+#define shadow_lock(_d) \
+ do { \
+ if ( unlikely((_d)->arch.shadow.locker == current->processor) ) \
+ { \
+ printk("Error: shadow lock held by %s\n", \
+ (_d)->arch.shadow.locker_function); \
+ BUG(); \
+ } \
+ spin_lock(&(_d)->arch.shadow.lock); \
+ ASSERT((_d)->arch.shadow.locker == -1); \
+ (_d)->arch.shadow.locker = current->processor; \
+ (_d)->arch.shadow.locker_function = __func__; \
+ } while (0)
+
+#define shadow_unlock(_d) \
+ do { \
+ ASSERT((_d)->arch.shadow.locker == current->processor); \
+ (_d)->arch.shadow.locker = -1; \
+ (_d)->arch.shadow.locker_function = "nobody"; \
+ spin_unlock(&(_d)->arch.shadow.lock); \
+ } while (0)
+
+/*
+ * Levels of self-test and paranoia
+ * XXX should go in config files somewhere?
+ */
+#define SHADOW_AUDIT_HASH 0x01 /* Check current hash bucket */
+#define SHADOW_AUDIT_HASH_FULL 0x02 /* Check every hash bucket */
+#define SHADOW_AUDIT_ENTRIES 0x04 /* Check this walk's shadows */
+#define SHADOW_AUDIT_ENTRIES_FULL 0x08 /* Check every shadow */
+#define SHADOW_AUDIT_ENTRIES_MFNS 0x10 /* Check gfn-mfn map in shadows */
+#define SHADOW_AUDIT_P2M 0x20 /* Check the p2m table */
+
+#ifdef NDEBUG
+#define SHADOW_AUDIT 0
+#define SHADOW_AUDIT_ENABLE 0
+#else
+#define SHADOW_AUDIT 0x15 /* Basic audit of all except p2m. */
+#define SHADOW_AUDIT_ENABLE shadow_audit_enable
+extern int shadow_audit_enable;
+#endif
+
+/*
+ * Levels of optimization
+ * XXX should go in config files somewhere?
+ */
+#define SHOPT_WRITABLE_HEURISTIC 0x01 /* Guess at RW PTEs via linear maps */
+#define SHOPT_EARLY_UNSHADOW 0x02 /* Unshadow l1s on fork or exit */
+
+#define SHADOW_OPTIMIZATIONS 0x03
+
+
+/* With shadow pagetables, the different kinds of address start
+ * to get confusing.
+ *
+ * Virtual addresses are what they usually are: the addresses that are used
+ * to access memory while the guest is running. The MMU translates from
+ * virtual addresses to machine addresses.
+ *
+ * (Pseudo-)physical addresses are the abstraction of physical memory the
+ * guest uses for allocation and so forth. For the purposes of this code,
+ * we can largely ignore them.
+ *
+ * Guest frame numbers (gfns) are the entries that the guest puts in its
+ * pagetables. For normal paravirtual guests, they are actual frame numbers,
+ * with the translation done by the guest.
+ *
+ * Machine frame numbers (mfns) are the entries that the hypervisor puts
+ * in the shadow page tables.
+ *
+ * Elsewhere in the xen code base, the name "gmfn" is generally used to refer
+ * to a "machine frame number, from the guest's perspective", or in other
+ * words, pseudo-physical frame numbers. However, in the shadow code, the
+ * term "gmfn" means "the mfn of a guest page"; this combines naturally with
+ * other terms such as "smfn" (the mfn of a shadow page), gl2mfn (the mfn of a
+ * guest L2 page), etc...
+ */
+
+/* With this defined, we do some ugly things to force the compiler to
+ * give us type safety between mfns and gfns and other integers.
+ * TYPE_SAFE(int foo) defines a foo_t, and _foo() and foo_x() functions
+ * that translate between int and foo_t.
+ *
+ * It does have some performance cost because the types now have
+ * a different storage attribute, so may not want it on all the time. */
+#ifndef NDEBUG
+#define TYPE_SAFETY 1
+#endif
+
+#ifdef TYPE_SAFETY
+#define TYPE_SAFE(_type,_name) \
+typedef struct { _type _name; } _name##_t; \
+static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \
+static inline _type _name##_x(_name##_t n) { return n._name; }
+#else
+#define TYPE_SAFE(_type,_name) \
+typedef _type _name##_t; \
+static inline _name##_t _##_name(_type n) { return n; } \
+static inline _type _name##_x(_name##_t n) { return n; }
+#endif
+
+TYPE_SAFE(unsigned long,mfn)
+#define SH_PRI_mfn "05lx"
+
+static inline int
+valid_mfn(mfn_t m)
+{
+ return VALID_MFN(mfn_x(m));
+}
+
+static inline mfn_t
+pagetable_get_mfn(pagetable_t pt)
+{
+ return _mfn(pagetable_get_pfn(pt));
+}
+
+static inline pagetable_t
+pagetable_from_mfn(mfn_t mfn)
+{
+ return pagetable_from_pfn(mfn_x(mfn));
+}
+
+static inline int
+shadow_vcpu_mode_translate(struct vcpu *v)
+{
+ // Returns true if this VCPU needs to be using the P2M table to translate
+ // between GFNs and MFNs.
+ //
+    // This is true of translated HVM domains on a vcpu which has paging
+    // enabled. (HVM vcpus with paging disabled are using the p2m table as
+    // their paging table, so no translation occurs in this case.)
+ //
+ return v->arch.shadow.hvm_paging_enabled;
+}
+
+
+/**************************************************************************/
+/* Mode-specific entry points into the shadow code */
+
+struct x86_emulate_ctxt;
+struct shadow_paging_mode {
+ int (*page_fault )(struct vcpu *v, unsigned long va,
+ struct cpu_user_regs *regs);
+ int (*invlpg )(struct vcpu *v, unsigned long va);
+ unsigned long (*gva_to_gpa )(struct vcpu *v, unsigned long va);
+ unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va);
+ void (*update_cr3 )(struct vcpu *v);
+ int (*map_and_validate_gl1e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl2e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl2he)(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl3e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl4e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ void (*detach_old_tables )(struct vcpu *v);
+ int (*x86_emulate_write )(struct vcpu *v, unsigned long va,
+ void *src, u32 bytes,
+ struct x86_emulate_ctxt *ctxt);
+ int (*x86_emulate_cmpxchg )(struct vcpu *v, unsigned long va,
+ unsigned long old,
+ unsigned long new,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt);
+ int (*x86_emulate_cmpxchg8b )(struct vcpu *v, unsigned long va,
+ unsigned long old_lo,
+ unsigned long old_hi,
+ unsigned long new_lo,
+ unsigned long new_hi,
+ struct x86_emulate_ctxt *ctxt);
+ mfn_t (*make_monitor_table )(struct vcpu *v);
+ void (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn);
+#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+ int (*guess_wrmap )(struct vcpu *v,
+ unsigned long vaddr, mfn_t gmfn);
+#endif
+ /* For outsiders to tell what mode we're in */
+ unsigned int shadow_levels;
+ unsigned int guest_levels;
+};
+
+static inline int shadow_guest_paging_levels(struct vcpu *v)
+{
+ ASSERT(v->arch.shadow.mode != NULL);
+ return v->arch.shadow.mode->guest_levels;
+}
+
+/**************************************************************************/
+/* Entry points into the shadow code */
+
+/* Turning on shadow test mode */
+int shadow_test_enable(struct domain *d);
+
+/* Handler for shadow control ops: enabling and disabling shadow modes,
+ * and log-dirty bitmap ops all happen through here. */
+int shadow_domctl(struct domain *d,
+ xen_domctl_shadow_op_t *sc,
+ XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
+
+/* Call when destroying a domain */
+void shadow_teardown(struct domain *d);
+
+/* Call once all of the references to the domain have gone away */
+void shadow_final_teardown(struct domain *d);
+
+
+/* Mark a page as dirty in the bitmap */
+void sh_do_mark_dirty(struct domain *d, mfn_t gmfn);
+static inline void mark_dirty(struct domain *d, unsigned long gmfn)
+{
+ if ( shadow_mode_log_dirty(d) )
+ {
+ shadow_lock(d);
+ sh_do_mark_dirty(d, _mfn(gmfn));
+ shadow_unlock(d);
+ }
+}
+
+/* Internal version, for when the shadow lock is already held */
+static inline void sh_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+ ASSERT(shadow_lock_is_acquired(d));
+ if ( shadow_mode_log_dirty(d) )
+ sh_do_mark_dirty(d, gmfn);
+}
+
+static inline int
+shadow_fault(unsigned long va, struct cpu_user_regs *regs)
+/* Called from pagefault handler in Xen, and from the HVM trap handlers
+ * for pagefaults. Returns 1 if this fault was an artefact of the
+ * shadow code (and the guest should retry) or 0 if it is not (and the
+ * fault should be handled elsewhere or passed to the guest). */
+{
+ struct vcpu *v = current;
+ perfc_incrc(shadow_fault);
+ return v->arch.shadow.mode->page_fault(v, va, regs);
+}
+
+static inline int
+shadow_invlpg(struct vcpu *v, unsigned long va)
+/* Called when the guest requests an invlpg. Returns 1 if the invlpg
+ * instruction should be issued on the hardware, or 0 if it's safe not
+ * to do so. */
+{
+ return v->arch.shadow.mode->invlpg(v, va);
+}
+
+static inline unsigned long
+shadow_gva_to_gpa(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to what the *guest*
+ * pagetables would map it to. */
+{
+ return v->arch.shadow.mode->gva_to_gpa(v, va);
+}
+
+static inline unsigned long
+shadow_gva_to_gfn(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to what the *guest*
+ * pagetables would map it to. */
+{
+ return v->arch.shadow.mode->gva_to_gfn(v, va);
+}
+
+static inline void
+shadow_update_cr3(struct vcpu *v)
+/* Updates all the things that are derived from the guest's CR3.
+ * Called when the guest changes CR3. */
+{
+ shadow_lock(v->domain);
+ v->arch.shadow.mode->update_cr3(v);
+ shadow_unlock(v->domain);
+}
+
+
+/* Should be called after CR3 is updated.
+ * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3.
+ *
+ * Also updates other state derived from CR3 (vcpu->arch.guest_vtable,
+ * shadow_vtable, etc).
+ *
+ * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
+ * for HVM guests, arch.monitor_table and hvm's guest CR3.
+ *
+ * Update ref counts to shadow tables appropriately.
+ * For PAE, relocate L3 entries, if necessary, into low memory.
+ */
+static inline void update_cr3(struct vcpu *v)
+{
+ unsigned long cr3_mfn=0;
+
+ if ( shadow_mode_enabled(v->domain) )
+ {
+ shadow_update_cr3(v);
+ return;
+ }
+
+#if CONFIG_PAGING_LEVELS == 4
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
+ else
+#endif
+ cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
+
+ make_cr3(v, cr3_mfn);
+}
+
+extern void sh_update_paging_modes(struct vcpu *v);
+
+/* Should be called to initialise paging structures if the paging mode
+ * has changed, and when bringing up a VCPU for the first time. */
+static inline void shadow_update_paging_modes(struct vcpu *v)
+{
+ ASSERT(shadow_mode_enabled(v->domain));
+ shadow_lock(v->domain);
+ sh_update_paging_modes(v);
+ shadow_unlock(v->domain);
+}
+
+static inline void
+shadow_detach_old_tables(struct vcpu *v)
+{
+ if ( v->arch.shadow.mode )
+ v->arch.shadow.mode->detach_old_tables(v);
+}
+
+static inline mfn_t
+shadow_make_monitor_table(struct vcpu *v)
+{
+ return v->arch.shadow.mode->make_monitor_table(v);
+}
+
+static inline void
+shadow_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
+{
+ v->arch.shadow.mode->destroy_monitor_table(v, mmfn);
+}
+
+/* Validate a pagetable change from the guest and update the shadows. */
+extern int shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry);
+
+/* Update the shadows in response to a pagetable write from a HVM guest */
+extern void shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size);
+
+/* Remove all writeable mappings of a guest frame from the shadows.
+ * Returns non-zero if we need to flush TLBs.
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access. */
+extern int shadow_remove_write_access(struct vcpu *v, mfn_t readonly_mfn,
+ unsigned int level,
+ unsigned long fault_addr);
+
+/* Remove all mappings of the guest mfn from the shadows.
+ * Returns non-zero if we need to flush TLBs. */
+extern int shadow_remove_all_mappings(struct vcpu *v, mfn_t target_mfn);
+
+void
+shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn);
+/* This is an HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+
+/* Remove all shadows of the guest mfn. */
+extern void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all);
+static inline void shadow_remove_all_shadows(struct vcpu *v, mfn_t gmfn)
+{
+ sh_remove_shadows(v, gmfn, 1);
+}
+
+/* Add a page to a domain */
+void
+shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn);
+
+/* Remove a page from a domain */
+void
+shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn);
+
+/*
+ * Definitions for the shadow_flags field in page_info.
+ * These flags are stored on *guest* pages...
+ * Bits 1-13 are encodings for the shadow types.
+ */
+#define PGC_SH_type_to_index(_type) ((_type) >> PGC_SH_type_shift)
+#define SHF_page_type_mask \
+ (((1u << (PGC_SH_type_to_index(PGC_SH_max_shadow) + 1u)) - 1u) - \
+ ((1u << PGC_SH_type_to_index(PGC_SH_min_shadow)) - 1u))
+
+#define SHF_L1_32 (1u << PGC_SH_type_to_index(PGC_SH_l1_32_shadow))
+#define SHF_FL1_32 (1u << PGC_SH_type_to_index(PGC_SH_fl1_32_shadow))
+#define SHF_L2_32 (1u << PGC_SH_type_to_index(PGC_SH_l2_32_shadow))
+#define SHF_L1_PAE (1u << PGC_SH_type_to_index(PGC_SH_l1_pae_shadow))
+#define SHF_FL1_PAE (1u << PGC_SH_type_to_index(PGC_SH_fl1_pae_shadow))
+#define SHF_L2_PAE (1u << PGC_SH_type_to_index(PGC_SH_l2_pae_shadow))
+#define SHF_L2H_PAE (1u << PGC_SH_type_to_index(PGC_SH_l2h_pae_shadow))
+#define SHF_L3_PAE (1u << PGC_SH_type_to_index(PGC_SH_l3_pae_shadow))
+#define SHF_L1_64 (1u << PGC_SH_type_to_index(PGC_SH_l1_64_shadow))
+#define SHF_FL1_64 (1u << PGC_SH_type_to_index(PGC_SH_fl1_64_shadow))
+#define SHF_L2_64 (1u << PGC_SH_type_to_index(PGC_SH_l2_64_shadow))
+#define SHF_L3_64 (1u << PGC_SH_type_to_index(PGC_SH_l3_64_shadow))
+#define SHF_L4_64 (1u << PGC_SH_type_to_index(PGC_SH_l4_64_shadow))
+
+/* Used for hysteresis when automatically unhooking mappings on fork/exit */
+#define SHF_unhooked_mappings (1u<<31)
+
+/*
+ * Allocation of shadow pages
+ */
+
+/* Return the minimum acceptable number of shadow pages a domain needs */
+unsigned int shadow_min_acceptable_pages(struct domain *d);
+
+/* Set the pool of shadow pages to the required number of MB.
+ * Input will be rounded up to at least shadow_min_acceptable_pages().
+ * Returns 0 for success, 1 for failure. */
+unsigned int shadow_set_allocation(struct domain *d,
+ unsigned int megabytes,
+ int *preempted);
+
+/* Return the size of the shadow pool, rounded up to the nearest MB */
+static inline unsigned int shadow_get_allocation(struct domain *d)
+{
+ unsigned int pg = d->arch.shadow.total_pages;
+ return ((pg >> (20 - PAGE_SHIFT))
+ + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
+}
+
+/*
+ * Linked list for chaining entries in the shadow hash table.
+ */
+/* One entry in the shadow hash table: maps (n, t) -> smfn. */
+struct shadow_hash_entry {
+    struct shadow_hash_entry *next;
+    mfn_t smfn; /* MFN of the shadow */
+/* Fix: was "#ifdef _x86_64_", which is never defined (the predefined macro
+ * is __x86_64__), so the space-saving bitfield was silently compiled out. */
+#ifdef __x86_64__ /* Shorten 'n' so we don't waste a whole word on storing 't' */
+    unsigned long n:56; /* MFN of guest PT or GFN of guest superpage */
+#else
+    unsigned long n; /* MFN of guest PT or GFN of guest superpage */
+#endif
+    unsigned char t; /* shadow type bits, or 0 for empty */
+};
+
+#define SHADOW_HASH_BUCKETS 251
+/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
+
+
+/* NOTE(review): SHOPT_CACHE_WALKS is not among the SHOPT_* flags defined
+ * above (only SHOPT_WRITABLE_HEURISTIC and SHOPT_EARLY_UNSHADOW exist), and
+ * an undefined identifier evaluates to 0 inside #if, so this whole section
+ * is currently compiled out -- confirm whether a SHOPT_CACHE_WALKS bit was
+ * meant to be defined in the optimization list. */
+#if SHADOW_OPTIMIZATIONS & SHOPT_CACHE_WALKS
+/* Optimization: cache the results of guest walks. This helps with MMIO
+ * and emulated writes, which tend to issue very similar walk requests
+ * repeatedly. We keep the results of the last few walks, and blow
+ * away the cache on guest cr3 write, mode change, or page fault. */
+
+#define SH_WALK_CACHE_ENTRIES 4
+
+/* Rather than cache a guest walk, which would include mapped pointers
+ * to pages, we cache what a TLB would remember about the walk: the
+ * permissions and the l1 gfn */
+struct shadow_walk_cache {
+    unsigned long va; /* The virtual address (or 0 == unused) */
+    unsigned long gfn; /* The gfn from the effective l1e */
+    u32 permissions; /* The aggregated permission bits */
+};
+#endif
+
+
+/**************************************************************************/
+/* Guest physmap (p2m) support */
+
+/* Walk another domain's P2M table, mapping pages as we go */
+extern mfn_t
+sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn);
+
+
+/* General conversion function from gfn to mfn */
+static inline mfn_t
+sh_gfn_to_mfn(struct domain *d, unsigned long gfn)
+{
+ if ( !shadow_mode_translate(d) )
+ return _mfn(gfn);
+ else if ( likely(current->domain == d) )
+ return _mfn(get_mfn_from_gpfn(gfn));
+ else
+ return sh_gfn_to_mfn_foreign(d, gfn);
+}
+
+// vcpu-specific version of gfn_to_mfn(). This is where we hide the dirty
+// little secret that, for hvm guests with paging disabled, nearly all of the
+// shadow code actually thinks that the guest is running on *untranslated* page
+// tables (which is actually domain->phys_table).
+//
+static inline mfn_t
+sh_vcpu_gfn_to_mfn(struct vcpu *v, unsigned long gfn)
+{
+ if ( !shadow_vcpu_mode_translate(v) )
+ return _mfn(gfn);
+ if ( likely(current->domain == v->domain) )
+ return _mfn(get_mfn_from_gpfn(gfn));
+ return sh_gfn_to_mfn_foreign(v->domain, gfn);
+}
+
+static inline unsigned long
+sh_mfn_to_gfn(struct domain *d, mfn_t mfn)
+{
+ if ( shadow_mode_translate(d) )
+ return get_gpfn_from_mfn(mfn_x(mfn));
+ else
+ return mfn_x(mfn);
+}
+
+
#endif /* _XEN_SHADOW_H */
* mode: C
* c-set-style: "BSD"
* c-basic-offset: 4
- * tab-width: 4
* indent-tabs-mode: nil
* End:
*/
+
+++ /dev/null
-/******************************************************************************
- * arch/x86/shadow2-multi.h
- *
- * Shadow2 declarations which will be multiply compiled.
- * Parts of this code are Copyright (c) 2006 by XenSource Inc.
- * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
- * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-extern int
-SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, SHADOW_LEVELS, GUEST_LEVELS)(
- struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size);
-extern int
-SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, SHADOW_LEVELS, GUEST_LEVELS)(
- struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
-extern int
-SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, SHADOW_LEVELS, GUEST_LEVELS)(
- struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
-extern int
-SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, SHADOW_LEVELS, GUEST_LEVELS)(
- struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size);
-extern int
-SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, SHADOW_LEVELS, GUEST_LEVELS)(
- struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size);
-
-extern void
-SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
- struct vcpu *v, mfn_t smfn);
-extern void
-SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
- struct vcpu *v, mfn_t smfn);
-extern void
-SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
- struct vcpu *v, mfn_t smfn);
-extern void
-SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
- struct vcpu *v, mfn_t smfn);
-
-extern void
-SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows, 3, 3)
- (struct vcpu *v, mfn_t smfn);
-
-extern void
-SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl2mfn);
-extern void
-SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl3mfn);
-extern void
-SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl4mfn);
-
-extern int
-SHADOW2_INTERNAL_NAME(sh2_remove_write_access, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn);
-extern int
-SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn);
-
-extern void
-SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, void *ep, mfn_t smfn);
-
-extern int
-SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn);
-extern int
-SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn);
-extern int
-SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn);
-
-#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
-int
-SHADOW2_INTERNAL_NAME(sh2_audit_l1_table, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl1mfn, mfn_t x);
-int
-SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl1mfn, mfn_t x);
-int
-SHADOW2_INTERNAL_NAME(sh2_audit_l2_table, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl2mfn, mfn_t x);
-int
-SHADOW2_INTERNAL_NAME(sh2_audit_l3_table, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl3mfn, mfn_t x);
-int
-SHADOW2_INTERNAL_NAME(sh2_audit_l4_table, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl4mfn, mfn_t x);
-#endif
-
-#if SHADOW_LEVELS == GUEST_LEVELS
-extern mfn_t
-SHADOW2_INTERNAL_NAME(sh2_make_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v);
-extern void
-SHADOW2_INTERNAL_NAME(sh2_destroy_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
- (struct vcpu *v, mfn_t mmfn);
-#endif
-
-extern struct shadow2_paging_mode
-SHADOW2_INTERNAL_NAME(sh2_paging_mode, SHADOW_LEVELS, GUEST_LEVELS);
+++ /dev/null
-/******************************************************************************
- * arch/x86/shadow2-private.h
- *
- * Shadow2 code that is private, and does not need to be multiply compiled.
- * Parts of this code are Copyright (c) 2006 by XenSource Inc.
- * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
- * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _XEN_SHADOW2_PRIVATE_H
-#define _XEN_SHADOW2_PRIVATE_H
-
-// In order to override the definition of mfn_to_page, we make sure page.h has
-// been included...
-#include <asm/page.h>
-#include <xen/domain_page.h>
-#include <asm/x86_emulate.h>
-#include <asm/hvm/support.h>
-
-
-/******************************************************************************
- * Definitions for the use of the "available" bits in the shadow PTEs.
- *
- * Review of the low 12 bits of a shadow page table entry:
- *
- * in a guest: in a shadow:
- * Bit 11: _PAGE_AVAIL2, aka _PAGE_GNTTAB
- * Bit 10: _PAGE_AVAIL1 _PAGE_SHADOW_RW ("SW" below)
- * Bit 9: _PAGE_AVAIL0 _PAGE_SHADOW_PRESENT ("SP" below)
- * Bit 8: _PAGE_GLOBAL _PAGE_SHADOW_MMIO ("MMIO" below),
- * aka _PAGE_SHADOW_GUEST_NOT_PRESENT
- * Bit 7: _PAGE_PSE, aka _PAGE_PAT
- * Bit 6: _PAGE_DIRTY
- * Bit 5: _PAGE_ACCESSED
- * Bit 4: _PAGE_PCD
- * Bit 3: _PAGE_PWT
- * Bit 2: _PAGE_USER
- * Bit 1: _PAGE_RW ("GW" below)
- * Bit 0: _PAGE_PRESENT ("GP" below)
- *
- * Given a guest entry, as shown below, we can expect the following in the
- * corresponding shadow entry:
- *
- * Guest entry Shadow entry Commentary
- * ----------- ---------------- ---------------------------------------------
- * Maps
- * GP GW IO GP SP GW SW MMIO
- * -- -- ---- -- -- -- -- ----
- * - - - 0 0 0 0 0 The guest entry has not yet been shadowed.
- * 0 - - 0 0 0 0 1 The guest entry is marked not-present.
- * 1 1 no ? 1 ? 1 0 Writable entry in the guest.
- * 1 0 no ? 1 0 0 0 Read-only entry in the guest.
- * 1 1 yes 0 1 ? 1 1 Writable MMIO mapping in the guest.
- * 1 0 yes 0 1 0 0 1 Read-only MMIO mapping in the guest.
- *
- * Normally, we would expect that GP=1 in the guest to imply GP=1 in the
- * shadow, and similarly for GW=1. However, various functionality that may be
- * implemented via the shadow can cause GP or GW to be cleared in such cases.
- * A & D bit emulation is a prime example of such functionality.
- *
- * If _PAGE_SHADOW_PRESENT is zero, then the _PAGE_PRESENT bit in that same
- * entry will always be zero, too.
-
- * Bit 11 is used in debug builds as the _PAGE_GNTTAB bit in PV guests. It is
- * currently available for random (ab)use in shadow entries.
- *
- * Bit 8 (the global bit) could be propagated from an HVM guest to the shadow,
- * but currently there is no benefit, as the guest's TLB is flushed on every
- * transition of CR3 anyway due to the HVM exit/re-entry.
- *
- * In shadow entries in which the _PAGE_SHADOW_PRESENT is set, bit 8 is used
- * as the _PAGE_SHADOW_MMIO bit. In such entries, if _PAGE_SHADOW_MMIO is
- * set, then the entry contains the *gfn* directly from the corresponding
- * guest entry (not an mfn!!).
- *
- * Bit 7 is set in a guest L2 to signify a superpage entry. The current
- * shadow code splinters superpage mappings into 512 or 1024 4K mappings; the
- * resulting shadow L1 table is called an FL1. Note that there is no guest
- * page that corresponds to an FL1.
- *
- * Bit 7 in a guest L1 is the PAT2 bit. Currently we do not support PAT in
- * this shadow code.
- *
- * Bit 6 is the dirty bit.
- *
- * Bit 5 is the accessed bit.
- *
- * Bit 4 is the cache disable bit. If set in a guest, the hardware is
- * supposed to refuse to cache anything found via this entry. It can be set
- * in an L4e, L3e, L2e, or L1e. This shadow code currently does not support
- * cache disable bits. They are silently ignored.
- *
- * Bit 4 is a guest L1 is also the PAT1 bit. Currently we do not support PAT
- * in this shadow code.
- *
- * Bit 3 is the cache write-thru bit. If set in a guest, the hardware is
- * supposed to use write-thru instead of write-back caching for anything found
- * via this entry. It can be set in an L4e, L3e, L2e, or L1e. This shadow
- * code currently does not support cache write-thru bits. They are silently
- * ignored.
- *
- * Bit 3 is a guest L1 is also the PAT0 bit. Currently we do not support PAT
- * in this shadow code.
- *
- * Bit 2 is the user bit.
- *
- * Bit 1 is the read-write bit.
- *
- * Bit 0 is the present bit.
- */
-
-// Copy of the _PAGE_RW bit from the guest's PTE, appropriately zero'ed by
-// the appropriate shadow rules.
-#define _PAGE_SHADOW_RW _PAGE_AVAIL1
-
-// Copy of the _PAGE_PRESENT bit from the guest's PTE
-#define _PAGE_SHADOW_PRESENT _PAGE_AVAIL0
-
-// The matching guest entry maps MMIO space
-#define _PAGE_SHADOW_MMIO _PAGE_GLOBAL
-
-// Shadow flags value used when the guest is not present
-#define _PAGE_SHADOW_GUEST_NOT_PRESENT _PAGE_GLOBAL
-
-
-/******************************************************************************
- * Debug and error-message output
- */
-#define SHADOW2_PRINTK(_f, _a...) \
- debugtrace_printk("sh2: %s(): " _f, __func__, ##_a)
-#define SHADOW2_ERROR(_f, _a...) \
- printk("sh2 error: %s(): " _f, __func__, ##_a)
-#define SHADOW2_DEBUG(flag, _f, _a...) \
- do { \
- if (SHADOW2_DEBUG_ ## flag) \
- debugtrace_printk("sh2debug: %s(): " _f, __func__, ##_a); \
- } while (0)
-
-// The flags for use with SHADOW2_DEBUG:
-#define SHADOW2_DEBUG_PROPAGATE 0
-#define SHADOW2_DEBUG_MAKE_SHADOW 0
-#define SHADOW2_DEBUG_DESTROY_SHADOW 0
-#define SHADOW2_DEBUG_P2M 0
-#define SHADOW2_DEBUG_A_AND_D 0
-#define SHADOW2_DEBUG_EMULATE 0
-#define SHADOW2_DEBUG_LOGDIRTY 1
-
-
-/******************************************************************************
- * Auditing routines
- */
-
-#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL
-extern void shadow2_audit_tables(struct vcpu *v);
-#else
-#define shadow2_audit_tables(_v) do {} while(0)
-#endif
-
-#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M
-extern void shadow2_audit_p2m(struct domain *d);
-#else
-#define shadow2_audit_p2m(_d) do {} while(0)
-#endif
-
-
-/******************************************************************************
- * Mechanism for double-checking the optimized pagefault path: this
- * structure contains a record of actions taken by the fault handling
- * code. In paranoid mode, the fast-path code fills out one of these
- * structures (but doesn't take any actual action) and then the normal
- * path fills in another. When the fault handler finishes, the
- * two are compared */
-
-#ifdef SHADOW2_OPTIMIZATION_PARANOIA
-
-typedef struct shadow2_action_log sh2_log_t;
-struct shadow2_action_log {
- paddr_t ad[CONFIG_PAGING_LEVELS]; /* A & D bits propagated here */
- paddr_t mmio; /* Address of an mmio operation */
- int rv; /* Result of the fault handler */
-};
-
-/* There are two logs, one for the fast path, one for the normal path */
-enum sh2_log_type { log_slow = 0, log_fast= 1 };
-
-/* Alloc and zero the logs */
-static inline void sh2_init_log(struct vcpu *v)
-{
- if ( unlikely(!v->arch.shadow2.action_log) )
- v->arch.shadow2.action_log = xmalloc_array(sh2_log_t, 2);
- ASSERT(v->arch.shadow2.action_log);
- memset(v->arch.shadow2.action_log, 0, 2 * sizeof (sh2_log_t));
-}
-
-/* Log an A&D-bit update */
-static inline void sh2_log_ad(struct vcpu *v, paddr_t e, unsigned int level)
-{
- v->arch.shadow2.action_log[v->arch.shadow2.action_index].ad[level] = e;
-}
-
-/* Log an MMIO address */
-static inline void sh2_log_mmio(struct vcpu *v, paddr_t m)
-{
- v->arch.shadow2.action_log[v->arch.shadow2.action_index].mmio = m;
-}
-
-/* Log the result */
-static inline void sh2_log_rv(struct vcpu *v, int rv)
-{
- v->arch.shadow2.action_log[v->arch.shadow2.action_index].rv = rv;
-}
-
-/* Set which mode we're in */
-static inline void sh2_set_log_mode(struct vcpu *v, enum sh2_log_type t)
-{
- v->arch.shadow2.action_index = t;
-}
-
-/* Know not to take action, because we're only checking the mechanism */
-static inline int sh2_take_no_action(struct vcpu *v)
-{
- return (v->arch.shadow2.action_index == log_fast);
-}
-
-#else /* Non-paranoid mode: these logs do not exist */
-
-#define sh2_init_log(_v) do { (void)(_v); } while(0)
-#define sh2_set_log_mode(_v,_t) do { (void)(_v); } while(0)
-#define sh2_log_ad(_v,_e,_l) do { (void)(_v),(void)(_e),(void)(_l); } while (0)
-#define sh2_log_mmio(_v,_m) do { (void)(_v),(void)(_m); } while (0)
-#define sh2_log_rv(_v,_r) do { (void)(_v),(void)(_r); } while (0)
-#define sh2_take_no_action(_v) (((void)(_v)), 0)
-
-#endif /* SHADOW2_OPTIMIZATION_PARANOIA */
-
-
-/******************************************************************************
- * Macro for dealing with the naming of the internal names of the
- * shadow code's external entry points.
- */
-#define SHADOW2_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) \
- name ## __shadow_ ## shadow_levels ## _guest_ ## guest_levels
-#define SHADOW2_INTERNAL_NAME(name, shadow_levels, guest_levels) \
- SHADOW2_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels)
-
-#if CONFIG_PAGING_LEVELS == 2
-#define GUEST_LEVELS 2
-#define SHADOW_LEVELS 2
-#include <asm/shadow2-multi.h>
-#undef GUEST_LEVELS
-#undef SHADOW_LEVELS
-#endif /* CONFIG_PAGING_LEVELS == 2 */
-
-#if CONFIG_PAGING_LEVELS == 3
-#define GUEST_LEVELS 2
-#define SHADOW_LEVELS 3
-#include <asm/shadow2-multi.h>
-#undef GUEST_LEVELS
-#undef SHADOW_LEVELS
-
-#define GUEST_LEVELS 3
-#define SHADOW_LEVELS 3
-#include <asm/shadow2-multi.h>
-#undef GUEST_LEVELS
-#undef SHADOW_LEVELS
-#endif /* CONFIG_PAGING_LEVELS == 3 */
-
-#if CONFIG_PAGING_LEVELS == 4
-#define GUEST_LEVELS 2
-#define SHADOW_LEVELS 3
-#include <asm/shadow2-multi.h>
-#undef GUEST_LEVELS
-#undef SHADOW_LEVELS
-
-#define GUEST_LEVELS 3
-#define SHADOW_LEVELS 3
-#include <asm/shadow2-multi.h>
-#undef GUEST_LEVELS
-#undef SHADOW_LEVELS
-
-#define GUEST_LEVELS 3
-#define SHADOW_LEVELS 4
-#include <asm/shadow2-multi.h>
-#undef GUEST_LEVELS
-#undef SHADOW_LEVELS
-
-#define GUEST_LEVELS 4
-#define SHADOW_LEVELS 4
-#include <asm/shadow2-multi.h>
-#undef GUEST_LEVELS
-#undef SHADOW_LEVELS
-#endif /* CONFIG_PAGING_LEVELS == 4 */
-
-
-/******************************************************************************
- * Various function declarations
- */
-
-/* x86 emulator support */
-extern struct x86_emulate_ops shadow2_emulator_ops;
-
-/* Hash table functions */
-mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t);
-void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn);
-void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn);
-
-/* shadow promotion */
-void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type);
-void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type);
-
-/* Shadow page allocation functions */
-void shadow2_prealloc(struct domain *d, unsigned int order);
-mfn_t shadow2_alloc(struct domain *d,
- u32 shadow_type,
- unsigned long backpointer);
-void shadow2_free(struct domain *d, mfn_t smfn);
-
-/* Function to convert a shadow to log-dirty */
-void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn);
-
-/* Dispatcher function: call the per-mode function that will unhook the
- * non-Xen mappings in this top-level shadow mfn */
-void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn);
-
-/* Re-sync copies of PAE shadow L3 tables if they have been changed */
-void sh2_pae_recopy(struct domain *d);
-
-/* Install the xen mappings in various flavours of shadow */
-void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn);
-void sh2_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn);
-void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn);
-void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn);
-
-
-/******************************************************************************
- * MFN/page-info handling
- */
-
-// Override mfn_to_page from asm/page.h, which was #include'd above,
-// in order to make it work with our mfn type.
-#undef mfn_to_page
-#define mfn_to_page(_mfn) (frame_table + mfn_x(_mfn))
-
-// Override page_to_mfn from asm/page.h, which was #include'd above,
-// in order to make it work with our mfn type.
-#undef page_to_mfn
-#define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
-
-// Override mfn_valid from asm/page.h, which was #include'd above,
-// in order to make it work with our mfn type.
-#undef mfn_valid
-#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page)
-
-// Provide mfn_t-aware versions of common xen functions
-static inline void *
-sh2_map_domain_page(mfn_t mfn)
-{
- /* XXX Using the monitor-table as a map will happen here */
- return map_domain_page(mfn_x(mfn));
-}
-
-static inline void
-sh2_unmap_domain_page(void *p)
-{
- /* XXX Using the monitor-table as a map will happen here */
- unmap_domain_page(p);
-}
-
-static inline void *
-sh2_map_domain_page_global(mfn_t mfn)
-{
- /* XXX Using the monitor-table as a map will happen here */
- return map_domain_page_global(mfn_x(mfn));
-}
-
-static inline void
-sh2_unmap_domain_page_global(void *p)
-{
- /* XXX Using the monitor-table as a map will happen here */
- unmap_domain_page_global(p);
-}
-
-static inline int
-sh2_mfn_is_dirty(struct domain *d, mfn_t gmfn)
-/* Is this guest page dirty? Call only in log-dirty mode. */
-{
- unsigned long pfn;
- ASSERT(shadow2_mode_log_dirty(d));
- ASSERT(d->arch.shadow2.dirty_bitmap != NULL);
-
- /* We /really/ mean PFN here, even for non-translated guests. */
- pfn = get_gpfn_from_mfn(mfn_x(gmfn));
- if ( likely(VALID_M2P(pfn))
- && likely(pfn < d->arch.shadow2.dirty_bitmap_size)
- && test_bit(pfn, d->arch.shadow2.dirty_bitmap) )
- return 1;
-
- return 0;
-}
-
-static inline int
-sh2_mfn_is_a_page_table(mfn_t gmfn)
-{
- struct page_info *page = mfn_to_page(gmfn);
- struct domain *owner;
- unsigned long type_info;
-
- if ( !valid_mfn(gmfn) )
- return 0;
-
- owner = page_get_owner(page);
- if ( owner && shadow2_mode_refcounts(owner)
- && (page->count_info & PGC_page_table) )
- return 1;
-
- type_info = page->u.inuse.type_info & PGT_type_mask;
- return type_info && (type_info <= PGT_l4_page_table);
-}
-
-
-/**************************************************************************/
-/* Shadow-page refcounting. See comment in shadow2-common.c about the
- * use of struct page_info fields for shadow pages */
-
-void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn);
-
-/* Increase the refcount of a shadow page. Arguments are the mfn to refcount,
- * and the physical address of the shadow entry that holds the ref (or zero
- * if the ref is held by something else) */
-static inline void sh2_get_ref(mfn_t smfn, paddr_t entry_pa)
-{
- u32 x, nx;
- struct page_info *page = mfn_to_page(smfn);
-
- ASSERT(mfn_valid(smfn));
-
- x = page->count_info & PGC_SH2_count_mask;
- nx = x + 1;
-
- if ( unlikely(nx & ~PGC_SH2_count_mask) )
- {
- SHADOW2_PRINTK("shadow ref overflow, gmfn=%" PRtype_info " smfn=%lx\n",
- page->u.inuse.type_info, mfn_x(smfn));
- domain_crash_synchronous();
- }
-
- /* Guarded by the shadow lock, so no need for atomic update */
- page->count_info &= ~PGC_SH2_count_mask;
- page->count_info |= nx;
-
- /* We remember the first shadow entry that points to each shadow. */
- if ( entry_pa != 0 && page->up == 0 )
- page->up = entry_pa;
-}
-
-
-/* Decrease the refcount of a shadow page. As for get_ref, takes the
- * physical address of the shadow entry that held this reference. */
-static inline void sh2_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
-{
- u32 x, nx;
- struct page_info *page = mfn_to_page(smfn);
-
- ASSERT(mfn_valid(smfn));
- ASSERT(page_get_owner(page) == NULL);
-
- /* If this is the entry in the up-pointer, remove it */
- if ( entry_pa != 0 && page->up == entry_pa )
- page->up = 0;
-
- x = page->count_info & PGC_SH2_count_mask;
- nx = x - 1;
-
- if ( unlikely(x == 0) )
- {
- SHADOW2_PRINTK("shadow ref underflow, smfn=%lx oc=%08x t=%"
- PRtype_info "\n",
- mfn_x(smfn),
- page->count_info & PGC_SH2_count_mask,
- page->u.inuse.type_info);
- domain_crash_synchronous();
- }
-
- /* Guarded by the shadow lock, so no need for atomic update */
- page->count_info &= ~PGC_SH2_count_mask;
- page->count_info |= nx;
-
- if ( unlikely(nx == 0) )
- sh2_destroy_shadow(v, smfn);
-}
-
-
-/* Pin a shadow page: take an extra refcount and set the pin bit. */
-static inline void sh2_pin(mfn_t smfn)
-{
- struct page_info *page;
-
- ASSERT(mfn_valid(smfn));
- page = mfn_to_page(smfn);
- if ( !(page->count_info & PGC_SH2_pinned) )
- {
- sh2_get_ref(smfn, 0);
- page->count_info |= PGC_SH2_pinned;
- }
-}
-
-/* Unpin a shadow page: unset the pin bit and release the extra ref. */
-static inline void sh2_unpin(struct vcpu *v, mfn_t smfn)
-{
- struct page_info *page;
-
- ASSERT(mfn_valid(smfn));
- page = mfn_to_page(smfn);
- if ( page->count_info & PGC_SH2_pinned )
- {
- page->count_info &= ~PGC_SH2_pinned;
- sh2_put_ref(v, smfn, 0);
- }
-}
-
-/**************************************************************************/
-/* Guest physmap (p2m) support */
-
-/* Read our own P2M table, checking in the linear pagetables first to be
- * sure that we will succeed. Call this function if you expect it to
- * fail often, as it avoids page faults. If you expect to succeed, use
- * vcpu_gfn_to_mfn, which copy_from_user()s the entry */
-static inline mfn_t
-vcpu_gfn_to_mfn_nofault(struct vcpu *v, unsigned long gfn)
-{
- unsigned long entry_addr = (unsigned long) &phys_to_machine_mapping[gfn];
-#if CONFIG_PAGING_LEVELS >= 4
- l4_pgentry_t *l4e;
- l3_pgentry_t *l3e;
-#endif
- l2_pgentry_t *l2e;
- l1_pgentry_t *l1e;
-
- ASSERT(current == v);
- if ( !shadow2_vcpu_mode_translate(v) )
- return _mfn(gfn);
-
-#if CONFIG_PAGING_LEVELS > 2
- if ( gfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
- /* This pfn is higher than the p2m map can hold */
- return _mfn(INVALID_MFN);
-#endif
-
- /* Walk the linear pagetables. Note that this is *not* the same as
- * the walk in sh2_gfn_to_mfn_foreign, which is walking the p2m map */
-#if CONFIG_PAGING_LEVELS >= 4
- l4e = __linear_l4_table + l4_linear_offset(entry_addr);
- if ( !(l4e_get_flags(*l4e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
- l3e = __linear_l3_table + l3_linear_offset(entry_addr);
- if ( !(l3e_get_flags(*l3e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
-#endif
- l2e = __linear_l2_table + l2_linear_offset(entry_addr);
- if ( !(l2e_get_flags(*l2e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
- l1e = __linear_l1_table + l1_linear_offset(entry_addr);
- if ( !(l1e_get_flags(*l1e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
-
- /* Safe to look at this part of the table */
- if ( l1e_get_flags(phys_to_machine_mapping[gfn]) & _PAGE_PRESENT )
- return _mfn(l1e_get_pfn(phys_to_machine_mapping[gfn]));
-
- return _mfn(INVALID_MFN);
-}
-
-
-#endif /* _XEN_SHADOW2_PRIVATE_H */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/******************************************************************************
- * include/asm-x86/shadow2-types.h
- *
- * Parts of this code are Copyright (c) 2006 by XenSource Inc.
- * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
- * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _XEN_SHADOW2_TYPES_H
-#define _XEN_SHADOW2_TYPES_H
-
-// Map a shadow page
-static inline void *
-map_shadow_page(mfn_t smfn)
-{
- // XXX -- Possible optimization/measurement question for 32-bit and PAE
- // hypervisors:
- // How often is this smfn already available in the shadow linear
- // table? Might it be worth checking that table first,
- // presumably using the reverse map hint in the page_info of this
- // smfn, rather than calling map_domain_page()?
- //
- return sh2_map_domain_page(smfn);
-}
-
-// matching unmap for map_shadow_page()
-static inline void
-unmap_shadow_page(void *p)
-{
- sh2_unmap_domain_page(p);
-}
-
-/*
- * Define various types for handling pagetabels, based on these options:
- * SHADOW_PAGING_LEVELS : Number of levels of shadow pagetables
- * GUEST_PAGING_LEVELS : Number of levels of guest pagetables
- */
-
-#if (CONFIG_PAGING_LEVELS < SHADOW_PAGING_LEVELS)
-#error Cannot have more levels of shadow pagetables than host pagetables
-#endif
-
-#if (SHADOW_PAGING_LEVELS < GUEST_PAGING_LEVELS)
-#error Cannot have more levels of guest pagetables than shadow pagetables
-#endif
-
-#if SHADOW_PAGING_LEVELS == 2
-#define SHADOW_L1_PAGETABLE_ENTRIES 1024
-#define SHADOW_L2_PAGETABLE_ENTRIES 1024
-#define SHADOW_L1_PAGETABLE_SHIFT 12
-#define SHADOW_L2_PAGETABLE_SHIFT 22
-#endif
-
-#if SHADOW_PAGING_LEVELS == 3
-#define SHADOW_L1_PAGETABLE_ENTRIES 512
-#define SHADOW_L2_PAGETABLE_ENTRIES 512
-#define SHADOW_L3_PAGETABLE_ENTRIES 4
-#define SHADOW_L1_PAGETABLE_SHIFT 12
-#define SHADOW_L2_PAGETABLE_SHIFT 21
-#define SHADOW_L3_PAGETABLE_SHIFT 30
-#endif
-
-#if SHADOW_PAGING_LEVELS == 4
-#define SHADOW_L1_PAGETABLE_ENTRIES 512
-#define SHADOW_L2_PAGETABLE_ENTRIES 512
-#define SHADOW_L3_PAGETABLE_ENTRIES 512
-#define SHADOW_L4_PAGETABLE_ENTRIES 512
-#define SHADOW_L1_PAGETABLE_SHIFT 12
-#define SHADOW_L2_PAGETABLE_SHIFT 21
-#define SHADOW_L3_PAGETABLE_SHIFT 30
-#define SHADOW_L4_PAGETABLE_SHIFT 39
-#endif
-
-/* Types of the shadow page tables */
-typedef l1_pgentry_t shadow_l1e_t;
-typedef l2_pgentry_t shadow_l2e_t;
-#if SHADOW_PAGING_LEVELS >= 3
-typedef l3_pgentry_t shadow_l3e_t;
-#if SHADOW_PAGING_LEVELS >= 4
-typedef l4_pgentry_t shadow_l4e_t;
-#endif
-#endif
-
-/* Access functions for them */
-static inline paddr_t shadow_l1e_get_paddr(shadow_l1e_t sl1e)
-{ return l1e_get_paddr(sl1e); }
-static inline paddr_t shadow_l2e_get_paddr(shadow_l2e_t sl2e)
-{ return l2e_get_paddr(sl2e); }
-#if SHADOW_PAGING_LEVELS >= 3
-static inline paddr_t shadow_l3e_get_paddr(shadow_l3e_t sl3e)
-{ return l3e_get_paddr(sl3e); }
-#if SHADOW_PAGING_LEVELS >= 4
-static inline paddr_t shadow_l4e_get_paddr(shadow_l4e_t sl4e)
-{ return l4e_get_paddr(sl4e); }
-#endif
-#endif
-
-static inline mfn_t shadow_l1e_get_mfn(shadow_l1e_t sl1e)
-{ return _mfn(l1e_get_pfn(sl1e)); }
-static inline mfn_t shadow_l2e_get_mfn(shadow_l2e_t sl2e)
-{ return _mfn(l2e_get_pfn(sl2e)); }
-#if SHADOW_PAGING_LEVELS >= 3
-static inline mfn_t shadow_l3e_get_mfn(shadow_l3e_t sl3e)
-{ return _mfn(l3e_get_pfn(sl3e)); }
-#if SHADOW_PAGING_LEVELS >= 4
-static inline mfn_t shadow_l4e_get_mfn(shadow_l4e_t sl4e)
-{ return _mfn(l4e_get_pfn(sl4e)); }
-#endif
-#endif
-
-static inline u32 shadow_l1e_get_flags(shadow_l1e_t sl1e)
-{ return l1e_get_flags(sl1e); }
-static inline u32 shadow_l2e_get_flags(shadow_l2e_t sl2e)
-{ return l2e_get_flags(sl2e); }
-#if SHADOW_PAGING_LEVELS >= 3
-static inline u32 shadow_l3e_get_flags(shadow_l3e_t sl3e)
-{ return l3e_get_flags(sl3e); }
-#if SHADOW_PAGING_LEVELS >= 4
-static inline u32 shadow_l4e_get_flags(shadow_l4e_t sl4e)
-{ return l4e_get_flags(sl4e); }
-#endif
-#endif
-
-static inline shadow_l1e_t
-shadow_l1e_remove_flags(shadow_l1e_t sl1e, u32 flags)
-{ l1e_remove_flags(sl1e, flags); return sl1e; }
-
-static inline shadow_l1e_t shadow_l1e_empty(void)
-{ return l1e_empty(); }
-static inline shadow_l2e_t shadow_l2e_empty(void)
-{ return l2e_empty(); }
-#if SHADOW_PAGING_LEVELS >= 3
-static inline shadow_l3e_t shadow_l3e_empty(void)
-{ return l3e_empty(); }
-#if SHADOW_PAGING_LEVELS >= 4
-static inline shadow_l4e_t shadow_l4e_empty(void)
-{ return l4e_empty(); }
-#endif
-#endif
-
-static inline shadow_l1e_t shadow_l1e_from_mfn(mfn_t mfn, u32 flags)
-{ return l1e_from_pfn(mfn_x(mfn), flags); }
-static inline shadow_l2e_t shadow_l2e_from_mfn(mfn_t mfn, u32 flags)
-{ return l2e_from_pfn(mfn_x(mfn), flags); }
-#if SHADOW_PAGING_LEVELS >= 3
-static inline shadow_l3e_t shadow_l3e_from_mfn(mfn_t mfn, u32 flags)
-{ return l3e_from_pfn(mfn_x(mfn), flags); }
-#if SHADOW_PAGING_LEVELS >= 4
-static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags)
-{ return l4e_from_pfn(mfn_x(mfn), flags); }
-#endif
-#endif
-
-#define shadow_l1_table_offset(a) l1_table_offset(a)
-#define shadow_l2_table_offset(a) l2_table_offset(a)
-#define shadow_l3_table_offset(a) l3_table_offset(a)
-#define shadow_l4_table_offset(a) l4_table_offset(a)
-
-/**************************************************************************/
-/* Access to the linear mapping of shadow page tables. */
-
-/* Offsets into each level of the linear mapping for a virtual address. */
-#define shadow_l1_linear_offset(_a) \
- (((_a) & VADDR_MASK) >> SHADOW_L1_PAGETABLE_SHIFT)
-#define shadow_l2_linear_offset(_a) \
- (((_a) & VADDR_MASK) >> SHADOW_L2_PAGETABLE_SHIFT)
-#define shadow_l3_linear_offset(_a) \
- (((_a) & VADDR_MASK) >> SHADOW_L3_PAGETABLE_SHIFT)
-#define shadow_l4_linear_offset(_a) \
- (((_a) & VADDR_MASK) >> SHADOW_L4_PAGETABLE_SHIFT)
-
-/* Where to find each level of the linear mapping. For PV guests, we use
- * the shadow linear-map self-entry as many times as we need. For HVM
- * guests, the shadow doesn't have a linear-map self-entry so we must use
- * the monitor-table's linear-map entry N-1 times and then the shadow-map
- * entry once. */
-#define __sh2_linear_l1_table ((shadow_l1e_t *)(SH_LINEAR_PT_VIRT_START))
-#define __sh2_linear_l2_table ((shadow_l2e_t *) \
- (__sh2_linear_l1_table + shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)))
-
-// shadow linear L3 and L4 tables only exist in 4 level paging...
-#if SHADOW_PAGING_LEVELS == 4
-#define __sh2_linear_l3_table ((shadow_l3e_t *) \
- (__sh2_linear_l2_table + shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)))
-#define __sh2_linear_l4_table ((shadow_l4e_t *) \
- (__sh2_linear_l3_table + shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)))
-#endif
-
-#define sh2_linear_l1_table(v) ({ \
- ASSERT(current == (v)); \
- __sh2_linear_l1_table; \
-})
-
-#define sh2_linear_l2_table(v) ({ \
- ASSERT(current == (v)); \
- ((shadow_l2e_t *) \
- (hvm_guest(v) ? __linear_l1_table : __sh2_linear_l1_table) + \
- shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)); \
-})
-
-// shadow linear L3 and L4 tables only exist in 4 level paging...
-#if SHADOW_PAGING_LEVELS == 4
-#define sh2_linear_l3_table(v) ({ \
- ASSERT(current == (v)); \
- ((shadow_l3e_t *) \
- (hvm_guest(v) ? __linear_l2_table : __sh2_linear_l2_table) + \
- shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)); \
-})
-
-// we use l4_pgentry_t instead of shadow_l4e_t below because shadow_l4e_t is
-// not defined for when xen_levels==4 & shadow_levels==3...
-#define sh2_linear_l4_table(v) ({ \
- ASSERT(current == (v)); \
- ((l4_pgentry_t *) \
- (hvm_guest(v) ? __linear_l3_table : __sh2_linear_l3_table) + \
- shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)); \
-})
-#endif
-
-#if GUEST_PAGING_LEVELS == 2
-
-#include <asm/page-guest32.h>
-
-#define GUEST_L1_PAGETABLE_ENTRIES 1024
-#define GUEST_L2_PAGETABLE_ENTRIES 1024
-#define GUEST_L1_PAGETABLE_SHIFT 12
-#define GUEST_L2_PAGETABLE_SHIFT 22
-
-/* Type of the guest's frame numbers */
-TYPE_SAFE(u32,gfn)
-#define INVALID_GFN ((u32)(-1u))
-#define SH2_PRI_gfn "05x"
-
-/* Types of the guest's page tables */
-typedef l1_pgentry_32_t guest_l1e_t;
-typedef l2_pgentry_32_t guest_l2e_t;
-
-/* Access functions for them */
-static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
-{ return l1e_get_paddr_32(gl1e); }
-static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
-{ return l2e_get_paddr_32(gl2e); }
-
-static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
-{ return _gfn(l1e_get_paddr_32(gl1e) >> PAGE_SHIFT); }
-static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
-{ return _gfn(l2e_get_paddr_32(gl2e) >> PAGE_SHIFT); }
-
-static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
-{ return l1e_get_flags_32(gl1e); }
-static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
-{ return l2e_get_flags_32(gl2e); }
-
-static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
-{ l1e_add_flags_32(gl1e, flags); return gl1e; }
-static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
-{ l2e_add_flags_32(gl2e, flags); return gl2e; }
-
-static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
-{ return l1e_from_pfn_32(gfn_x(gfn), flags); }
-static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
-{ return l2e_from_pfn_32(gfn_x(gfn), flags); }
-
-#define guest_l1_table_offset(a) l1_table_offset_32(a)
-#define guest_l2_table_offset(a) l2_table_offset_32(a)
-
-/* The shadow types needed for the various levels. */
-#define PGC_SH2_l1_shadow PGC_SH2_l1_32_shadow
-#define PGC_SH2_l2_shadow PGC_SH2_l2_32_shadow
-#define PGC_SH2_fl1_shadow PGC_SH2_fl1_32_shadow
-
-#else /* GUEST_PAGING_LEVELS != 2 */
-
-#if GUEST_PAGING_LEVELS == 3
-#define GUEST_L1_PAGETABLE_ENTRIES 512
-#define GUEST_L2_PAGETABLE_ENTRIES 512
-#define GUEST_L3_PAGETABLE_ENTRIES 4
-#define GUEST_L1_PAGETABLE_SHIFT 12
-#define GUEST_L2_PAGETABLE_SHIFT 21
-#define GUEST_L3_PAGETABLE_SHIFT 30
-#else /* GUEST_PAGING_LEVELS == 4 */
-#define GUEST_L1_PAGETABLE_ENTRIES 512
-#define GUEST_L2_PAGETABLE_ENTRIES 512
-#define GUEST_L3_PAGETABLE_ENTRIES 512
-#define GUEST_L4_PAGETABLE_ENTRIES 512
-#define GUEST_L1_PAGETABLE_SHIFT 12
-#define GUEST_L2_PAGETABLE_SHIFT 21
-#define GUEST_L3_PAGETABLE_SHIFT 30
-#define GUEST_L4_PAGETABLE_SHIFT 39
-#endif
-
-/* Type of the guest's frame numbers */
-TYPE_SAFE(unsigned long,gfn)
-#define INVALID_GFN ((unsigned long)(-1ul))
-#define SH2_PRI_gfn "05lx"
-
-/* Types of the guest's page tables */
-typedef l1_pgentry_t guest_l1e_t;
-typedef l2_pgentry_t guest_l2e_t;
-typedef l3_pgentry_t guest_l3e_t;
-#if GUEST_PAGING_LEVELS >= 4
-typedef l4_pgentry_t guest_l4e_t;
-#endif
-
-/* Access functions for them */
-static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
-{ return l1e_get_paddr(gl1e); }
-static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
-{ return l2e_get_paddr(gl2e); }
-static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e)
-{ return l3e_get_paddr(gl3e); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e)
-{ return l4e_get_paddr(gl4e); }
-#endif
-
-static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
-{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); }
-static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
-{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); }
-static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e)
-{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e)
-{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); }
-#endif
-
-static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
-{ return l1e_get_flags(gl1e); }
-static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
-{ return l2e_get_flags(gl2e); }
-static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e)
-{ return l3e_get_flags(gl3e); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e)
-{ return l4e_get_flags(gl4e); }
-#endif
-
-static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
-{ l1e_add_flags(gl1e, flags); return gl1e; }
-static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
-{ l2e_add_flags(gl2e, flags); return gl2e; }
-static inline guest_l3e_t guest_l3e_add_flags(guest_l3e_t gl3e, u32 flags)
-{ l3e_add_flags(gl3e, flags); return gl3e; }
-#if GUEST_PAGING_LEVELS >= 4
-static inline guest_l4e_t guest_l4e_add_flags(guest_l4e_t gl4e, u32 flags)
-{ l4e_add_flags(gl4e, flags); return gl4e; }
-#endif
-
-static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
-{ return l1e_from_pfn(gfn_x(gfn), flags); }
-static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
-{ return l2e_from_pfn(gfn_x(gfn), flags); }
-static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags)
-{ return l3e_from_pfn(gfn_x(gfn), flags); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
-{ return l4e_from_pfn(gfn_x(gfn), flags); }
-#endif
-
-#define guest_l1_table_offset(a) l1_table_offset(a)
-#define guest_l2_table_offset(a) l2_table_offset(a)
-#define guest_l3_table_offset(a) l3_table_offset(a)
-#define guest_l4_table_offset(a) l4_table_offset(a)
-
-/* The shadow types needed for the various levels. */
-#if GUEST_PAGING_LEVELS == 3
-#define PGC_SH2_l1_shadow PGC_SH2_l1_pae_shadow
-#define PGC_SH2_fl1_shadow PGC_SH2_fl1_pae_shadow
-#define PGC_SH2_l2_shadow PGC_SH2_l2_pae_shadow
-#define PGC_SH2_l2h_shadow PGC_SH2_l2h_pae_shadow
-#define PGC_SH2_l3_shadow PGC_SH2_l3_pae_shadow
-#else
-#define PGC_SH2_l1_shadow PGC_SH2_l1_64_shadow
-#define PGC_SH2_fl1_shadow PGC_SH2_fl1_64_shadow
-#define PGC_SH2_l2_shadow PGC_SH2_l2_64_shadow
-#define PGC_SH2_l3_shadow PGC_SH2_l3_64_shadow
-#define PGC_SH2_l4_shadow PGC_SH2_l4_64_shadow
-#endif
-
-#endif /* GUEST_PAGING_LEVELS != 2 */
-
-#define VALID_GFN(m) (m != INVALID_GFN)
-
-static inline int
-valid_gfn(gfn_t m)
-{
- return VALID_GFN(gfn_x(m));
-}
-
-#if GUEST_PAGING_LEVELS == 2
-#define PGC_SH2_guest_root_type PGC_SH2_l2_32_shadow
-#elif GUEST_PAGING_LEVELS == 3
-#define PGC_SH2_guest_root_type PGC_SH2_l3_pae_shadow
-#else
-#define PGC_SH2_guest_root_type PGC_SH2_l4_64_shadow
-#endif
-
-/* Translation between mfns and gfns */
-static inline mfn_t
-vcpu_gfn_to_mfn(struct vcpu *v, gfn_t gfn)
-{
- return sh2_vcpu_gfn_to_mfn(v, gfn_x(gfn));
-}
-
-static inline gfn_t
-mfn_to_gfn(struct domain *d, mfn_t mfn)
-{
- return _gfn(sh2_mfn_to_gfn(d, mfn));
-}
-
-static inline paddr_t
-gfn_to_paddr(gfn_t gfn)
-{
- return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT;
-}
-
-/* Type used for recording a walk through guest pagetables. It is
- * filled in by the pagetable walk function, and also used as a cache
- * for later walks.
- * Any non-null pointer in this structure represents a mapping of guest
- * memory. We must always call walk_init() before using a walk_t, and
- * call walk_unmap() when we're done.
- * The "Effective l1e" field is used when there isn't an l1e to point to,
- * but we have fabricated an l1e for propagation to the shadow (e.g.,
- * for splintering guest superpages into many shadow l1 entries). */
-typedef struct shadow2_walk_t walk_t;
-struct shadow2_walk_t
-{
- unsigned long va; /* Address we were looking for */
-#if GUEST_PAGING_LEVELS >= 3
-#if GUEST_PAGING_LEVELS >= 4
- guest_l4e_t *l4e; /* Pointer to guest's level 4 entry */
-#endif
- guest_l3e_t *l3e; /* Pointer to guest's level 3 entry */
-#endif
- guest_l2e_t *l2e; /* Pointer to guest's level 2 entry */
- guest_l1e_t *l1e; /* Pointer to guest's level 1 entry */
- guest_l1e_t eff_l1e; /* Effective level 1 entry */
-#if GUEST_PAGING_LEVELS >= 3
-#if GUEST_PAGING_LEVELS >= 4
- mfn_t l4mfn; /* MFN that the level 4 entry is in */
-#endif
- mfn_t l3mfn; /* MFN that the level 3 entry is in */
-#endif
- mfn_t l2mfn; /* MFN that the level 2 entry is in */
- mfn_t l1mfn; /* MFN that the level 1 entry is in */
-};
-
-/* macros for dealing with the naming of the internal function names of the
- * shadow code's external entry points.
- */
-#define INTERNAL_NAME(name) \
- SHADOW2_INTERNAL_NAME(name, SHADOW_PAGING_LEVELS, GUEST_PAGING_LEVELS)
-
-/* macros for renaming the primary entry points, so that they are more
- * easily distinguished from a debugger
- */
-#define sh2_page_fault INTERNAL_NAME(sh2_page_fault)
-#define sh2_invlpg INTERNAL_NAME(sh2_invlpg)
-#define sh2_gva_to_gpa INTERNAL_NAME(sh2_gva_to_gpa)
-#define sh2_gva_to_gfn INTERNAL_NAME(sh2_gva_to_gfn)
-#define sh2_update_cr3 INTERNAL_NAME(sh2_update_cr3)
-#define sh2_remove_write_access INTERNAL_NAME(sh2_remove_write_access)
-#define sh2_remove_all_mappings INTERNAL_NAME(sh2_remove_all_mappings)
-#define sh2_remove_l1_shadow INTERNAL_NAME(sh2_remove_l1_shadow)
-#define sh2_remove_l2_shadow INTERNAL_NAME(sh2_remove_l2_shadow)
-#define sh2_remove_l3_shadow INTERNAL_NAME(sh2_remove_l3_shadow)
-#define sh2_map_and_validate_gl4e INTERNAL_NAME(sh2_map_and_validate_gl4e)
-#define sh2_map_and_validate_gl3e INTERNAL_NAME(sh2_map_and_validate_gl3e)
-#define sh2_map_and_validate_gl2e INTERNAL_NAME(sh2_map_and_validate_gl2e)
-#define sh2_map_and_validate_gl2he INTERNAL_NAME(sh2_map_and_validate_gl2he)
-#define sh2_map_and_validate_gl1e INTERNAL_NAME(sh2_map_and_validate_gl1e)
-#define sh2_destroy_l4_shadow INTERNAL_NAME(sh2_destroy_l4_shadow)
-#define sh2_destroy_l3_shadow INTERNAL_NAME(sh2_destroy_l3_shadow)
-#define sh2_destroy_l3_subshadow INTERNAL_NAME(sh2_destroy_l3_subshadow)
-#define sh2_unpin_all_l3_subshadows INTERNAL_NAME(sh2_unpin_all_l3_subshadows)
-#define sh2_destroy_l2_shadow INTERNAL_NAME(sh2_destroy_l2_shadow)
-#define sh2_destroy_l1_shadow INTERNAL_NAME(sh2_destroy_l1_shadow)
-#define sh2_unhook_32b_mappings INTERNAL_NAME(sh2_unhook_32b_mappings)
-#define sh2_unhook_pae_mappings INTERNAL_NAME(sh2_unhook_pae_mappings)
-#define sh2_unhook_64b_mappings INTERNAL_NAME(sh2_unhook_64b_mappings)
-#define sh2_paging_mode INTERNAL_NAME(sh2_paging_mode)
-#define sh2_detach_old_tables INTERNAL_NAME(sh2_detach_old_tables)
-#define sh2_x86_emulate_write INTERNAL_NAME(sh2_x86_emulate_write)
-#define sh2_x86_emulate_cmpxchg INTERNAL_NAME(sh2_x86_emulate_cmpxchg)
-#define sh2_x86_emulate_cmpxchg8b INTERNAL_NAME(sh2_x86_emulate_cmpxchg8b)
-#define sh2_audit_l1_table INTERNAL_NAME(sh2_audit_l1_table)
-#define sh2_audit_fl1_table INTERNAL_NAME(sh2_audit_fl1_table)
-#define sh2_audit_l2_table INTERNAL_NAME(sh2_audit_l2_table)
-#define sh2_audit_l3_table INTERNAL_NAME(sh2_audit_l3_table)
-#define sh2_audit_l4_table INTERNAL_NAME(sh2_audit_l4_table)
-#define sh2_guess_wrmap INTERNAL_NAME(sh2_guess_wrmap)
-#define sh2_clear_shadow_entry INTERNAL_NAME(sh2_clear_shadow_entry)
-
-/* sh2_make_monitor_table only depends on the number of shadow levels */
-#define sh2_make_monitor_table \
- SHADOW2_INTERNAL_NAME(sh2_make_monitor_table, \
- SHADOW_PAGING_LEVELS, \
- SHADOW_PAGING_LEVELS)
-#define sh2_destroy_monitor_table \
- SHADOW2_INTERNAL_NAME(sh2_destroy_monitor_table, \
- SHADOW_PAGING_LEVELS, \
- SHADOW_PAGING_LEVELS)
-
-
-#if GUEST_PAGING_LEVELS == 3
-/*
- * Accounting information stored in the shadow of PAE Guest L3 pages.
- * Because these "L3 pages" are only 32-bytes, it is inconvenient to keep
- * various refcounts, etc., on the page_info of their page. We provide extra
- * bookkeeping space in the shadow itself, and this is the structure
- * definition for that bookkeeping information.
- */
-struct pae_l3_bookkeeping {
- u32 vcpus; /* bitmap of which vcpus are currently storing
- * copies of this 32-byte page */
- u32 refcount; /* refcount for this 32-byte page */
- u8 pinned; /* is this 32-byte page pinned or not? */
-};
-
-// Convert a shadow entry pointer into a pae_l3_bookkeeping pointer.
-#define sl3p_to_info(_ptr) ((struct pae_l3_bookkeeping *) \
- (((unsigned long)(_ptr) & ~31) + 32))
-
-static void sh2_destroy_l3_subshadow(struct vcpu *v,
- shadow_l3e_t *sl3e);
-
-/* Increment a subshadow ref
- * Called with a pointer to the subshadow, and the mfn of the
- * *first* page of the overall shadow. */
-static inline void sh2_get_ref_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn)
-{
- struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
-
- /* First ref to the subshadow takes a ref to the full shadow */
- if ( bk->refcount == 0 )
- sh2_get_ref(smfn, 0);
- if ( unlikely(++(bk->refcount) == 0) )
- {
- SHADOW2_PRINTK("shadow l3 subshadow ref overflow, smfn=%" SH2_PRI_mfn " sh=%p\n",
- mfn_x(smfn), sl3e);
- domain_crash_synchronous();
- }
-}
-
-/* Decrement a subshadow ref.
- * Called with a pointer to the subshadow, and the mfn of the
- * *first* page of the overall shadow. Calling this may cause the
- * entire shadow to disappear, so the caller must immediately unmap
- * the pointer after calling. */
-static inline void sh2_put_ref_l3_subshadow(struct vcpu *v,
- shadow_l3e_t *sl3e,
- mfn_t smfn)
-{
- struct pae_l3_bookkeeping *bk;
-
- bk = sl3p_to_info(sl3e);
-
- ASSERT(bk->refcount > 0);
- if ( --(bk->refcount) == 0 )
- {
- /* Need to destroy this subshadow */
- sh2_destroy_l3_subshadow(v, sl3e);
- /* Last ref to the subshadow had a ref to the full shadow */
- sh2_put_ref(v, smfn, 0);
- }
-}
-
-/* Pin a subshadow
- * Called with a pointer to the subshadow, and the mfn of the
- * *first* page of the overall shadow. */
-static inline void sh2_pin_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn)
-{
- struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
-
-#if 0
- debugtrace_printk("%s smfn=%05lx offset=%ld\n",
- __func__, mfn_x(smfn),
- ((unsigned long)sl3e & ~PAGE_MASK) / 64);
-#endif
-
- if ( !bk->pinned )
- {
- bk->pinned = 1;
- sh2_get_ref_l3_subshadow(sl3e, smfn);
- }
-}
-
-/* Unpin a sub-shadow.
- * Called with a pointer to the subshadow, and the mfn of the
- * *first* page of the overall shadow. Calling this may cause the
- * entire shadow to disappear, so the caller must immediately unmap
- * the pointer after calling. */
-static inline void sh2_unpin_l3_subshadow(struct vcpu *v,
- shadow_l3e_t *sl3e,
- mfn_t smfn)
-{
- struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
-
-#if 0
- debugtrace_printk("%s smfn=%05lx offset=%ld\n",
- __func__, mfn_x(smfn),
- ((unsigned long)sl3e & ~PAGE_MASK) / 64);
-#endif
-
- if ( bk->pinned )
- {
- bk->pinned = 0;
- sh2_put_ref_l3_subshadow(v, sl3e, smfn);
- }
-}
-
-#endif /* GUEST_PAGING_LEVELS == 3 */
-
-#if SHADOW_PAGING_LEVELS == 3
-#define MFN_FITS_IN_HVM_CR3(_MFN) !(mfn_x(_MFN) >> 20)
-#endif
-
-#if SHADOW_PAGING_LEVELS == 2
-#define SH2_PRI_pte "08x"
-#else /* SHADOW_PAGING_LEVELS >= 3 */
-#ifndef __x86_64__
-#define SH2_PRI_pte "016llx"
-#else
-#define SH2_PRI_pte "016lx"
-#endif
-#endif /* SHADOW_PAGING_LEVELS >= 3 */
-
-#if GUEST_PAGING_LEVELS == 2
-#define SH2_PRI_gpte "08x"
-#else /* GUEST_PAGING_LEVELS >= 3 */
-#ifndef __x86_64__
-#define SH2_PRI_gpte "016llx"
-#else
-#define SH2_PRI_gpte "016lx"
-#endif
-#endif /* GUEST_PAGING_LEVELS >= 3 */
-
-static inline u32
-accumulate_guest_flags(walk_t *gw)
-{
- u32 accumulated_flags;
-
- // We accumulate the permission flags with bitwise ANDing.
- // This works for the PRESENT bit, RW bit, and USER bit.
- // For the NX bit, however, the polarity is wrong, so we accumulate the
- // inverse of the NX bit.
- //
- accumulated_flags = guest_l1e_get_flags(gw->eff_l1e) ^ _PAGE_NX_BIT;
- accumulated_flags &= guest_l2e_get_flags(*gw->l2e) ^ _PAGE_NX_BIT;
-
- // Note that PAE guests do not have USER or RW or NX bits in their L3s.
- //
-#if GUEST_PAGING_LEVELS == 3
- accumulated_flags &=
- ~_PAGE_PRESENT | (guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT);
-#elif GUEST_PAGING_LEVELS >= 4
- accumulated_flags &= guest_l3e_get_flags(*gw->l3e) ^ _PAGE_NX_BIT;
- accumulated_flags &= guest_l4e_get_flags(*gw->l4e) ^ _PAGE_NX_BIT;
-#endif
-
- // Finally, revert the NX bit back to its original polarity
- accumulated_flags ^= _PAGE_NX_BIT;
-
- return accumulated_flags;
-}
-
-#endif /* _XEN_SHADOW2_TYPES_H */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/******************************************************************************
- * include/asm-x86/shadow2.h
- *
- * Parts of this code are Copyright (c) 2006 by XenSource Inc.
- * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
- * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _XEN_SHADOW2_H
-#define _XEN_SHADOW2_H
-
-#include <public/domctl.h>
-#include <xen/sched.h>
-#include <xen/perfc.h>
-#include <asm/flushtlb.h>
-
-/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
-
-#define SHM2_shift 10
-/* We're in one of the shadow modes */
-#define SHM2_enable (1U << SHM2_shift)
-/* Refcounts based on shadow tables instead of guest tables */
-#define SHM2_refcounts (XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT << SHM2_shift)
-/* Enable log dirty mode */
-#define SHM2_log_dirty (XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY << SHM2_shift)
-/* Xen does p2m translation, not guest */
-#define SHM2_translate (XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE << SHM2_shift)
-/* Xen does not steal address space from the domain for its own booking;
- * requires VT or similar mechanisms */
-#define SHM2_external (XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL << SHM2_shift)
-
-#define shadow2_mode_enabled(_d) ((_d)->arch.shadow2.mode)
-#define shadow2_mode_refcounts(_d) ((_d)->arch.shadow2.mode & SHM2_refcounts)
-#define shadow2_mode_log_dirty(_d) ((_d)->arch.shadow2.mode & SHM2_log_dirty)
-#define shadow2_mode_translate(_d) ((_d)->arch.shadow2.mode & SHM2_translate)
-#define shadow2_mode_external(_d) ((_d)->arch.shadow2.mode & SHM2_external)
-
-/* Xen traps & emulates all reads of all page table pages:
- *not yet supported
- */
-#define shadow2_mode_trap_reads(_d) ({ (void)(_d); 0; })
-
-// flags used in the return value of the shadow_set_lXe() functions...
-#define SHADOW2_SET_CHANGED 0x1
-#define SHADOW2_SET_FLUSH 0x2
-#define SHADOW2_SET_ERROR 0x4
-#define SHADOW2_SET_L3PAE_RECOPY 0x8
-
-// How do we tell that we have a 32-bit PV guest in a 64-bit Xen?
-#ifdef __x86_64__
-#define pv_32bit_guest(_v) 0 // not yet supported
-#else
-#define pv_32bit_guest(_v) !hvm_guest(v)
-#endif
-
-/* The shadow2 lock.
- *
- * This lock is per-domain. It is intended to allow us to make atomic
- * updates to the software TLB that the shadow tables provide.
- *
- * Specifically, it protects:
- * - all changes to shadow page table pages
- * - the shadow hash table
- * - the shadow page allocator
- * - all changes to guest page table pages; if/when the notion of
- * out-of-sync pages is added to this code, then the shadow lock is
- * protecting all guest page table pages which are not listed as
- * currently as both guest-writable and out-of-sync...
- * XXX -- need to think about this relative to writable page tables.
- * - all changes to the page_info->tlbflush_timestamp
- * - the page_info->count fields on shadow pages
- * - the shadow dirty bit array and count
- * - XXX
- */
-#ifndef CONFIG_SMP
-#error shadow2.h currently requires CONFIG_SMP
-#endif
-
-#define shadow2_lock_init(_d) \
- do { \
- spin_lock_init(&(_d)->arch.shadow2.lock); \
- (_d)->arch.shadow2.locker = -1; \
- (_d)->arch.shadow2.locker_function = "nobody"; \
- } while (0)
-
-#define shadow2_lock_is_acquired(_d) \
- (current->processor == (_d)->arch.shadow2.locker)
-
-#define shadow2_lock(_d) \
- do { \
- if ( unlikely((_d)->arch.shadow2.locker == current->processor) ) \
- { \
- printk("Error: shadow2 lock held by %s\n", \
- (_d)->arch.shadow2.locker_function); \
- BUG(); \
- } \
- spin_lock(&(_d)->arch.shadow2.lock); \
- ASSERT((_d)->arch.shadow2.locker == -1); \
- (_d)->arch.shadow2.locker = current->processor; \
- (_d)->arch.shadow2.locker_function = __func__; \
- } while (0)
-
-#define shadow2_unlock(_d) \
- do { \
- ASSERT((_d)->arch.shadow2.locker == current->processor); \
- (_d)->arch.shadow2.locker = -1; \
- (_d)->arch.shadow2.locker_function = "nobody"; \
- spin_unlock(&(_d)->arch.shadow2.lock); \
- } while (0)
-
-/*
- * Levels of self-test and paranoia
- * XXX should go in config files somewhere?
- */
-#define SHADOW2_AUDIT_HASH 0x01 /* Check current hash bucket */
-#define SHADOW2_AUDIT_HASH_FULL 0x02 /* Check every hash bucket */
-#define SHADOW2_AUDIT_ENTRIES 0x04 /* Check this walk's shadows */
-#define SHADOW2_AUDIT_ENTRIES_FULL 0x08 /* Check every shadow */
-#define SHADOW2_AUDIT_ENTRIES_MFNS 0x10 /* Check gfn-mfn map in shadows */
-#define SHADOW2_AUDIT_P2M 0x20 /* Check the p2m table */
-
-#ifdef NDEBUG
-#define SHADOW2_AUDIT 0
-#define SHADOW2_AUDIT_ENABLE 0
-#else
-#define SHADOW2_AUDIT 0x15 /* Basic audit of all except p2m. */
-#define SHADOW2_AUDIT_ENABLE shadow2_audit_enable
-extern int shadow2_audit_enable;
-#endif
-
-/*
- * Levels of optimization
- * XXX should go in config files somewhere?
- */
-#define SH2OPT_WRITABLE_HEURISTIC 0x01 /* Guess at RW PTEs via linear maps */
-#define SH2OPT_EARLY_UNSHADOW 0x02 /* Unshadow l1s on fork or exit */
-
-#define SHADOW2_OPTIMIZATIONS 0x03
-
-
-/* With shadow pagetables, the different kinds of address start
- * to get get confusing.
- *
- * Virtual addresses are what they usually are: the addresses that are used
- * to accessing memory while the guest is running. The MMU translates from
- * virtual addresses to machine addresses.
- *
- * (Pseudo-)physical addresses are the abstraction of physical memory the
- * guest uses for allocation and so forth. For the purposes of this code,
- * we can largely ignore them.
- *
- * Guest frame numbers (gfns) are the entries that the guest puts in its
- * pagetables. For normal paravirtual guests, they are actual frame numbers,
- * with the translation done by the guest.
- *
- * Machine frame numbers (mfns) are the entries that the hypervisor puts
- * in the shadow page tables.
- *
- * Elsewhere in the xen code base, the name "gmfn" is generally used to refer
- * to a "machine frame number, from the guest's perspective", or in other
- * words, pseudo-physical frame numbers. However, in the shadow code, the
- * term "gmfn" means "the mfn of a guest page"; this combines naturally with
- * other terms such as "smfn" (the mfn of a shadow page), gl2mfn (the mfn of a
- * guest L2 page), etc...
- */
-
-/* With this defined, we do some ugly things to force the compiler to
- * give us type safety between mfns and gfns and other integers.
- * TYPE_SAFE(int foo) defines a foo_t, and _foo() and foo_x() functions
- * that translate beween int and foo_t.
- *
- * It does have some performance cost because the types now have
- * a different storage attribute, so may not want it on all the time. */
-#ifndef NDEBUG
-#define TYPE_SAFETY 1
-#endif
-
-#ifdef TYPE_SAFETY
-#define TYPE_SAFE(_type,_name) \
-typedef struct { _type _name; } _name##_t; \
-static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \
-static inline _type _name##_x(_name##_t n) { return n._name; }
-#else
-#define TYPE_SAFE(_type,_name) \
-typedef _type _name##_t; \
-static inline _name##_t _##_name(_type n) { return n; } \
-static inline _type _name##_x(_name##_t n) { return n; }
-#endif
-
-TYPE_SAFE(unsigned long,mfn)
-#define SH2_PRI_mfn "05lx"
-
-static inline int
-valid_mfn(mfn_t m)
-{
- return VALID_MFN(mfn_x(m));
-}
-
-static inline mfn_t
-pagetable_get_mfn(pagetable_t pt)
-{
- return _mfn(pagetable_get_pfn(pt));
-}
-
-static inline pagetable_t
-pagetable_from_mfn(mfn_t mfn)
-{
- return pagetable_from_pfn(mfn_x(mfn));
-}
-
-static inline int
-shadow2_vcpu_mode_translate(struct vcpu *v)
-{
- // Returns true if this VCPU needs to be using the P2M table to translate
- // between GFNs and MFNs.
- //
- // This is true of translated HVM domains on a vcpu which has paging
- // enabled. (HVM vcpu's with paging disabled are using the p2m table as
- // its paging table, so no translation occurs in this case.)
- //
- return v->arch.shadow2.hvm_paging_enabled;
-}
-
-
-/**************************************************************************/
-/* Mode-specific entry points into the shadow code */
-
-struct x86_emulate_ctxt;
-struct shadow2_paging_mode {
- int (*page_fault )(struct vcpu *v, unsigned long va,
- struct cpu_user_regs *regs);
- int (*invlpg )(struct vcpu *v, unsigned long va);
- unsigned long (*gva_to_gpa )(struct vcpu *v, unsigned long va);
- unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va);
- void (*update_cr3 )(struct vcpu *v);
- int (*map_and_validate_gl1e )(struct vcpu *v, mfn_t gmfn,
- void *new_guest_entry, u32 size);
- int (*map_and_validate_gl2e )(struct vcpu *v, mfn_t gmfn,
- void *new_guest_entry, u32 size);
- int (*map_and_validate_gl2he)(struct vcpu *v, mfn_t gmfn,
- void *new_guest_entry, u32 size);
- int (*map_and_validate_gl3e )(struct vcpu *v, mfn_t gmfn,
- void *new_guest_entry, u32 size);
- int (*map_and_validate_gl4e )(struct vcpu *v, mfn_t gmfn,
- void *new_guest_entry, u32 size);
- void (*detach_old_tables )(struct vcpu *v);
- int (*x86_emulate_write )(struct vcpu *v, unsigned long va,
- void *src, u32 bytes,
- struct x86_emulate_ctxt *ctxt);
- int (*x86_emulate_cmpxchg )(struct vcpu *v, unsigned long va,
- unsigned long old,
- unsigned long new,
- unsigned int bytes,
- struct x86_emulate_ctxt *ctxt);
- int (*x86_emulate_cmpxchg8b )(struct vcpu *v, unsigned long va,
- unsigned long old_lo,
- unsigned long old_hi,
- unsigned long new_lo,
- unsigned long new_hi,
- struct x86_emulate_ctxt *ctxt);
- mfn_t (*make_monitor_table )(struct vcpu *v);
- void (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn);
-#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
- int (*guess_wrmap )(struct vcpu *v,
- unsigned long vaddr, mfn_t gmfn);
-#endif
- /* For outsiders to tell what mode we're in */
- unsigned int shadow_levels;
- unsigned int guest_levels;
-};
-
-static inline int shadow2_guest_paging_levels(struct vcpu *v)
-{
- ASSERT(v->arch.shadow2.mode != NULL);
- return v->arch.shadow2.mode->guest_levels;
-}
-
-/**************************************************************************/
-/* Entry points into the shadow code */
-
-/* Turning on shadow2 test mode */
-int shadow2_test_enable(struct domain *d);
-
-/* Handler for shadow control ops: enabling and disabling shadow modes,
- * and log-dirty bitmap ops all happen through here. */
-int shadow2_domctl(struct domain *d,
- xen_domctl_shadow_op_t *sc,
- XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
-
-/* Call when destroying a domain */
-void shadow2_teardown(struct domain *d);
-
-/* Call once all of the references to the domain have gone away */
-void shadow2_final_teardown(struct domain *d);
-
-
-/* Mark a page as dirty in the bitmap */
-void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn);
-static inline void mark_dirty(struct domain *d, unsigned long gmfn)
-{
- if ( shadow2_mode_log_dirty(d) )
- {
- shadow2_lock(d);
- sh2_do_mark_dirty(d, _mfn(gmfn));
- shadow2_unlock(d);
- }
-}
-
-/* Internal version, for when the shadow lock is already held */
-static inline void sh2_mark_dirty(struct domain *d, mfn_t gmfn)
-{
- ASSERT(shadow2_lock_is_acquired(d));
- if ( shadow2_mode_log_dirty(d) )
- sh2_do_mark_dirty(d, gmfn);
-}
-
-static inline int
-shadow2_fault(unsigned long va, struct cpu_user_regs *regs)
-/* Called from pagefault handler in Xen, and from the HVM trap handlers
- * for pagefaults. Returns 1 if this fault was an artefact of the
- * shadow code (and the guest should retry) or 0 if it is not (and the
- * fault should be handled elsewhere or passed to the guest). */
-{
- struct vcpu *v = current;
- perfc_incrc(shadow2_fault);
- return v->arch.shadow2.mode->page_fault(v, va, regs);
-}
-
-static inline int
-shadow2_invlpg(struct vcpu *v, unsigned long va)
-/* Called when the guest requests an invlpg. Returns 1 if the invlpg
- * instruction should be issued on the hardware, or 0 if it's safe not
- * to do so. */
-{
- return v->arch.shadow2.mode->invlpg(v, va);
-}
-
-static inline unsigned long
-shadow2_gva_to_gpa(struct vcpu *v, unsigned long va)
-/* Called to translate a guest virtual address to what the *guest*
- * pagetables would map it to. */
-{
- return v->arch.shadow2.mode->gva_to_gpa(v, va);
-}
-
-static inline unsigned long
-shadow2_gva_to_gfn(struct vcpu *v, unsigned long va)
-/* Called to translate a guest virtual address to what the *guest*
- * pagetables would map it to. */
-{
- return v->arch.shadow2.mode->gva_to_gfn(v, va);
-}
-
-static inline void
-shadow2_update_cr3(struct vcpu *v)
-/* Updates all the things that are derived from the guest's CR3.
- * Called when the guest changes CR3. */
-{
- shadow2_lock(v->domain);
- v->arch.shadow2.mode->update_cr3(v);
- shadow2_unlock(v->domain);
-}
-
-
-/* Should be called after CR3 is updated.
- * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3.
- *
- * Also updates other state derived from CR3 (vcpu->arch.guest_vtable,
- * shadow_vtable, etc).
- *
- * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
- * for HVM guests, arch.monitor_table and hvm's guest CR3.
- *
- * Update ref counts to shadow tables appropriately.
- * For PAE, relocate L3 entries, if necessary, into low memory.
- */
-static inline void update_cr3(struct vcpu *v)
-{
- unsigned long cr3_mfn=0;
-
- if ( shadow2_mode_enabled(v->domain) )
- {
- shadow2_update_cr3(v);
- return;
- }
-
-#if CONFIG_PAGING_LEVELS == 4
- if ( !(v->arch.flags & TF_kernel_mode) )
- cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
- else
-#endif
- cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
-
- make_cr3(v, cr3_mfn);
-}
-
-extern void sh2_update_paging_modes(struct vcpu *v);
-
-/* Should be called to initialise paging structures if the paging mode
- * has changed, and when bringing up a VCPU for the first time. */
-static inline void shadow2_update_paging_modes(struct vcpu *v)
-{
- ASSERT(shadow2_mode_enabled(v->domain));
- shadow2_lock(v->domain);
- sh2_update_paging_modes(v);
- shadow2_unlock(v->domain);
-}
-
-static inline void
-shadow2_detach_old_tables(struct vcpu *v)
-{
- if ( v->arch.shadow2.mode )
- v->arch.shadow2.mode->detach_old_tables(v);
-}
-
-static inline mfn_t
-shadow2_make_monitor_table(struct vcpu *v)
-{
- return v->arch.shadow2.mode->make_monitor_table(v);
-}
-
-static inline void
-shadow2_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
-{
- v->arch.shadow2.mode->destroy_monitor_table(v, mmfn);
-}
-
-/* Validate a pagetable change from the guest and update the shadows. */
-extern int shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
- void *new_guest_entry);
-
-/* Update the shadows in response to a pagetable write from a HVM guest */
-extern void shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
- void *entry, u32 size);
-
-/* Remove all writeable mappings of a guest frame from the shadows.
- * Returns non-zero if we need to flush TLBs.
- * level and fault_addr desribe how we found this to be a pagetable;
- * level==0 means we have some other reason for revoking write access. */
-extern int shadow2_remove_write_access(struct vcpu *v, mfn_t readonly_mfn,
- unsigned int level,
- unsigned long fault_addr);
-
-/* Remove all mappings of the guest mfn from the shadows.
- * Returns non-zero if we need to flush TLBs. */
-extern int shadow2_remove_all_mappings(struct vcpu *v, mfn_t target_mfn);
-
-void
-shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn);
-/* This is a HVM page that we thing is no longer a pagetable.
- * Unshadow it, and recursively unshadow pages that reference it. */
-
-/* Remove all shadows of the guest mfn. */
-extern void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all);
-static inline void shadow2_remove_all_shadows(struct vcpu *v, mfn_t gmfn)
-{
- sh2_remove_shadows(v, gmfn, 1);
-}
-
-/* Add a page to a domain */
-void
-shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn,
- unsigned long mfn);
-
-/* Remove a page from a domain */
-void
-shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
- unsigned long mfn);
-
-/*
- * Definitions for the shadow2_flags field in page_info.
- * These flags are stored on *guest* pages...
- * Bits 1-13 are encodings for the shadow types.
- */
-#define PGC_SH2_type_to_index(_type) ((_type) >> PGC_SH2_type_shift)
-#define SH2F_page_type_mask \
- (((1u << (PGC_SH2_type_to_index(PGC_SH2_max_shadow) + 1u)) - 1u) - \
- ((1u << PGC_SH2_type_to_index(PGC_SH2_min_shadow)) - 1u))
-
-#define SH2F_L1_32 (1u << PGC_SH2_type_to_index(PGC_SH2_l1_32_shadow))
-#define SH2F_FL1_32 (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_32_shadow))
-#define SH2F_L2_32 (1u << PGC_SH2_type_to_index(PGC_SH2_l2_32_shadow))
-#define SH2F_L1_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l1_pae_shadow))
-#define SH2F_FL1_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_pae_shadow))
-#define SH2F_L2_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l2_pae_shadow))
-#define SH2F_L2H_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l2h_pae_shadow))
-#define SH2F_L3_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l3_pae_shadow))
-#define SH2F_L1_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l1_64_shadow))
-#define SH2F_FL1_64 (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_64_shadow))
-#define SH2F_L2_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l2_64_shadow))
-#define SH2F_L3_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l3_64_shadow))
-#define SH2F_L4_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l4_64_shadow))
-
-/* Used for hysteresis when automatically unhooking mappings on fork/exit */
-#define SH2F_unhooked_mappings (1u<<31)
-
-/*
- * Allocation of shadow pages
- */
-
-/* Return the minumum acceptable number of shadow pages a domain needs */
-unsigned int shadow2_min_acceptable_pages(struct domain *d);
-
-/* Set the pool of shadow pages to the required number of MB.
- * Input will be rounded up to at least min_acceptable_shadow_pages().
- * Returns 0 for success, 1 for failure. */
-unsigned int shadow2_set_allocation(struct domain *d,
- unsigned int megabytes,
- int *preempted);
-
-/* Return the size of the shadow2 pool, rounded up to the nearest MB */
-static inline unsigned int shadow2_get_allocation(struct domain *d)
-{
- unsigned int pg = d->arch.shadow2.total_pages;
- return ((pg >> (20 - PAGE_SHIFT))
- + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
-}
-
-/*
- * Linked list for chaining entries in the shadow hash table.
- */
-struct shadow2_hash_entry {
- struct shadow2_hash_entry *next;
- mfn_t smfn; /* MFN of the shadow */
-#ifdef _x86_64_ /* Shorten 'n' so we don't waste a whole word on storing 't' */
- unsigned long n:56; /* MFN of guest PT or GFN of guest superpage */
-#else
- unsigned long n; /* MFN of guest PT or GFN of guest superpage */
-#endif
- unsigned char t; /* shadow type bits, or 0 for empty */
-};
-
-#define SHADOW2_HASH_BUCKETS 251
-/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
-
-
-#if SHADOW2_OPTIMIZATIONS & SH2OPT_CACHE_WALKS
-/* Optimization: cache the results of guest walks. This helps with MMIO
- * and emulated writes, which tend to issue very similar walk requests
- * repeatedly. We keep the results of the last few walks, and blow
- * away the cache on guest cr3 write, mode change, or page fault. */
-
-#define SH2_WALK_CACHE_ENTRIES 4
-
-/* Rather than cache a guest walk, which would include mapped pointers
- * to pages, we cache what a TLB would remember about the walk: the
- * permissions and the l1 gfn */
-struct shadow2_walk_cache {
- unsigned long va; /* The virtual address (or 0 == unused) */
- unsigned long gfn; /* The gfn from the effective l1e */
- u32 permissions; /* The aggregated permission bits */
-};
-#endif
-
-
-/**************************************************************************/
-/* Guest physmap (p2m) support */
-
-/* Walk another domain's P2M table, mapping pages as we go */
-extern mfn_t
-sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn);
-
-
-/* General conversion function from gfn to mfn */
-static inline mfn_t
-sh2_gfn_to_mfn(struct domain *d, unsigned long gfn)
-{
- if ( !shadow2_mode_translate(d) )
- return _mfn(gfn);
- else if ( likely(current->domain == d) )
- return _mfn(get_mfn_from_gpfn(gfn));
- else
- return sh2_gfn_to_mfn_foreign(d, gfn);
-}
-
-// vcpu-specific version of gfn_to_mfn(). This is where we hide the dirty
-// little secret that, for hvm guests with paging disabled, nearly all of the
-// shadow code actually think that the guest is running on *untranslated* page
-// tables (which is actually domain->phys_table).
-//
-static inline mfn_t
-sh2_vcpu_gfn_to_mfn(struct vcpu *v, unsigned long gfn)
-{
- if ( !shadow2_vcpu_mode_translate(v) )
- return _mfn(gfn);
- if ( likely(current->domain == v->domain) )
- return _mfn(get_mfn_from_gpfn(gfn));
- return sh2_gfn_to_mfn_foreign(v->domain, gfn);
-}
-
-static inline unsigned long
-sh2_mfn_to_gfn(struct domain *d, mfn_t mfn)
-{
- if ( shadow2_mode_translate(d) )
- return get_gpfn_from_mfn(mfn_x(mfn));
- else
- return mfn_x(mfn);
-}
-
-
-
-#endif /* _XEN_SHADOW2_H */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
-